In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd

import re    # import re module
import nltk
from nltk.corpus import stopwords
import string
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import matplotlib.pyplot as plt

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# plot charts inline
%matplotlib inline
In [2]:
## read dataset
kmv_data_RiskFactor= pd.read_csv(r'index_kmv_data_RiskFactor.csv')
kmv_data_RiskFactor.head()
Out[2]:
Unnamed: 0 Unnamed: 0.1 gvkey year edf actual_10k_path_raw risk_factor_text
0 0 0 1004 2012 0.468847 edgar/data/1750/0001047469-12-007300.txt ). The reduction in backlog from May 31, 2011 ...
1 1 1 1004 2013 0.024986 edgar/data/1750/0001047469-13-007797.txt ** The following is a description of the prin...
2 2 2 1004 2014 0.000689 edgar/data/1750/0001047469-14-006243.txt ** The following is a description of the prin...
3 3 3 1004 2015 0.015321 edgar/data/1750/0001047469-15-006136.txt ** The following is a description of the prin...
4 4 4 1004 2016 0.026049 edgar/data/1750/0001047469-16-014299.txt NaN
In [10]:
#drop rows with Null values
kmv_data_RiskFactor_dropna = kmv_data_RiskFactor.dropna(how='any')
risk_factor_text = kmv_data_RiskFactor_dropna['risk_factor_text'][0:200].tolist()
edf = kmv_data_RiskFactor_dropna['edf'][0:200]
#remove '\n' + ‘\\n’
risk_factor_text = list(map(lambda i: str(i).replace('\n', '').replace('\\n', ''), risk_factor_text))
In [4]:
##count_words function
def count_polar(sentiment, tokens):
    with open(sentiment,'r') as f:
        sentiment_words  = [line.strip() for line in f]

    sentiment_tokens = [token for token in tokens \
                     if token in sentiment_words]
    negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
    for idx, token in enumerate(tokens):
        if token in sentiment_words:
            if idx>0:
                if tokens[idx-1] not in negations:
                    sentiment_tokens.append(token)
            else:
                sentiment_tokens.append(token)
    return len(sentiment_tokens)
In [11]:
## Count precentage of positive words and negative words in each document
ls_positive_count = []
ls_negative_count = []
ls_positive_prec  = []
ls_negative_prec  = []
n = 0 
for text in risk_factor_text:
    text = str(text)
    n += 1
    if n % 100 == 0:
        print(n)
    stop_words = stopwords.words('english')
    stop_words+=["risks", "risk", "competitive"]
    
    tokens = re.split(r"\W+", text)
    tokens = nltk.word_tokenize(text)   
    
    tokens=[token.strip(string.punctuation) for token in tokens\
           if token not in stop_words]
    tokens=[token.strip() for token in tokens if token.strip()!='']
    
    positive_count = count_polar("positive-words.txt", tokens)
    negative_count = count_polar("negative-words.txt", tokens)
    positive_prec  = positive_count / (positive_count + negative_count + 0.000000000001)
    negative_prec  = negative_count / (positive_count + negative_count + 0.000000000001)
    
    
    ls_positive_count.append(positive_count)
    ls_negative_count.append(negative_count)
    ls_positive_prec.append(positive_prec)
    ls_negative_prec.append(negative_prec)
    
#print(ls_positive_count,ls_negative_count)
#print(ls_positive_prec, ls_negative_prec)

    
100
200
In [12]:
from scipy.stats import linregress
linregress(ls_negative_count,edf)
linregress(ls_positive_count,edf)
linregress(ls_negative_prec, edf)
linregress(ls_positive_prec, edf)
Out[12]:
LinregressResult(slope=-6.89614544208486e-06, intercept=0.05143096390515523, rvalue=-0.013627767916311054, pvalue=0.8481130253696473, stderr=3.5959095314200034e-05)
Out[12]:
LinregressResult(slope=6.834838553424016e-06, intercept=0.04677439087379673, rvalue=0.012504820609897912, pvalue=0.8604956414332507, stderr=3.884044896977195e-05)
Out[12]:
LinregressResult(slope=-0.034490007809650336, intercept=0.0667821067711464, rvalue=-0.03569743856283507, pvalue=0.6157845329186611, stderr=0.06861937435618914)
Out[12]:
LinregressResult(slope=0.040328454432028135, intercept=0.03149618787358385, rvalue=0.03633698193370003, pvalue=0.6094699342286296, stderr=0.07882123306274227)
In [18]:
plt.scatter(ls_negative_count,edf)
plt.scatter(ls_positive_count,edf, color = 'r')
Out[18]:
<matplotlib.collections.PathCollection at 0x1583ab55b00>
Out[18]:
<matplotlib.collections.PathCollection at 0x1583ab3d6a0>
In [15]:
plt.plot(range(0,200),ls_negative_prec[0:200])
plt.plot(range(0,200),edf[0:200], color = 'r')
Out[15]:
[<matplotlib.lines.Line2D at 0x1e538c818d0>]
Out[15]:
[<matplotlib.lines.Line2D at 0x1e538c8e080>]
In [16]:
negative_words = ['regulation', 'law', 'loss']
with open("negative-words.txt",'r') as f:
    sentiment_words  = [line.strip() for line in f]
negative_words += sentiment_words
negative_words
with open("positive-words.txt",'r') as f:
    positive_words  = [line.strip() for line in f]
Out[16]:
['regulation',
 'law',
 'loss',
 '2-faced',
 '2-faces',
 'abnormal',
 'abolish',
 'abominable',
 'abominably',
 'abominate',
 'abomination',
 'abort',
 'aborted',
 'aborts',
 'abrade',
 'abrasive',
 'abrupt',
 'abruptly',
 'abscond',
 'absence',
 'absent-minded',
 'absentee',
 'absurd',
 'absurdity',
 'absurdly',
 'absurdness',
 'abuse',
 'abused',
 'abuses',
 'abusive',
 'abysmal',
 'abysmally',
 'abyss',
 'accidental',
 'accost',
 'accursed',
 'accusation',
 'accusations',
 'accuse',
 'accuses',
 'accusing',
 'accusingly',
 'acerbate',
 'acerbic',
 'acerbically',
 'ache',
 'ached',
 'aches',
 'achey',
 'aching',
 'acrid',
 'acridly',
 'acridness',
 'acrimonious',
 'acrimoniously',
 'acrimony',
 'adamant',
 'adamantly',
 'addict',
 'addicted',
 'addicting',
 'addicts',
 'admonish',
 'admonisher',
 'admonishingly',
 'admonishment',
 'admonition',
 'adulterate',
 'adulterated',
 'adulteration',
 'adulterier',
 'adversarial',
 'adversary',
 'adverse',
 'adversity',
 'afflict',
 'affliction',
 'afflictive',
 'affront',
 'afraid',
 'aggravate',
 'aggravating',
 'aggravation',
 'aggression',
 'aggressive',
 'aggressiveness',
 'aggressor',
 'aggrieve',
 'aggrieved',
 'aggrivation',
 'aghast',
 'agonies',
 'agonize',
 'agonizing',
 'agonizingly',
 'agony',
 'aground',
 'ail',
 'ailing',
 'ailment',
 'aimless',
 'alarm',
 'alarmed',
 'alarming',
 'alarmingly',
 'alienate',
 'alienated',
 'alienation',
 'allegation',
 'allegations',
 'allege',
 'allergic',
 'allergies',
 'allergy',
 'aloof',
 'altercation',
 'ambiguity',
 'ambiguous',
 'ambivalence',
 'ambivalent',
 'ambush',
 'amiss',
 'amputate',
 'anarchism',
 'anarchist',
 'anarchistic',
 'anarchy',
 'anemic',
 'anger',
 'angrily',
 'angriness',
 'angry',
 'anguish',
 'animosity',
 'annihilate',
 'annihilation',
 'annoy',
 'annoyance',
 'annoyances',
 'annoyed',
 'annoying',
 'annoyingly',
 'annoys',
 'anomalous',
 'anomaly',
 'antagonism',
 'antagonist',
 'antagonistic',
 'antagonize',
 'anti-',
 'anti-american',
 'anti-israeli',
 'anti-occupation',
 'anti-proliferation',
 'anti-semites',
 'anti-social',
 'anti-us',
 'anti-white',
 'antipathy',
 'antiquated',
 'antithetical',
 'anxieties',
 'anxiety',
 'anxious',
 'anxiously',
 'anxiousness',
 'apathetic',
 'apathetically',
 'apathy',
 'apocalypse',
 'apocalyptic',
 'apologist',
 'apologists',
 'appal',
 'appall',
 'appalled',
 'appalling',
 'appallingly',
 'apprehension',
 'apprehensions',
 'apprehensive',
 'apprehensively',
 'arbitrary',
 'arcane',
 'archaic',
 'arduous',
 'arduously',
 'argumentative',
 'arrogance',
 'arrogant',
 'arrogantly',
 'ashamed',
 'asinine',
 'asininely',
 'asinininity',
 'askance',
 'asperse',
 'aspersion',
 'aspersions',
 'assail',
 'assassin',
 'assassinate',
 'assault',
 'assult',
 'astray',
 'asunder',
 'atrocious',
 'atrocities',
 'atrocity',
 'atrophy',
 'attack',
 'attacks',
 'audacious',
 'audaciously',
 'audaciousness',
 'audacity',
 'audiciously',
 'austere',
 'authoritarian',
 'autocrat',
 'autocratic',
 'avalanche',
 'avarice',
 'avaricious',
 'avariciously',
 'avenge',
 'averse',
 'aversion',
 'aweful',
 'awful',
 'awfully',
 'awfulness',
 'awkward',
 'awkwardness',
 'ax',
 'babble',
 'back-logged',
 'back-wood',
 'back-woods',
 'backache',
 'backaches',
 'backaching',
 'backbite',
 'backbiting',
 'backward',
 'backwardness',
 'backwood',
 'backwoods',
 'bad',
 'badly',
 'baffle',
 'baffled',
 'bafflement',
 'baffling',
 'bait',
 'balk',
 'banal',
 'banalize',
 'bane',
 'banish',
 'banishment',
 'bankrupt',
 'barbarian',
 'barbaric',
 'barbarically',
 'barbarity',
 'barbarous',
 'barbarously',
 'barren',
 'baseless',
 'bash',
 'bashed',
 'bashful',
 'bashing',
 'bastard',
 'bastards',
 'battered',
 'battering',
 'batty',
 'bearish',
 'beastly',
 'bedlam',
 'bedlamite',
 'befoul',
 'beg',
 'beggar',
 'beggarly',
 'begging',
 'beguile',
 'belabor',
 'belated',
 'beleaguer',
 'belie',
 'belittle',
 'belittled',
 'belittling',
 'bellicose',
 'belligerence',
 'belligerent',
 'belligerently',
 'bemoan',
 'bemoaning',
 'bemused',
 'bent',
 'berate',
 'bereave',
 'bereavement',
 'bereft',
 'berserk',
 'beseech',
 'beset',
 'besiege',
 'besmirch',
 'bestial',
 'betray',
 'betrayal',
 'betrayals',
 'betrayer',
 'betraying',
 'betrays',
 'bewail',
 'beware',
 'bewilder',
 'bewildered',
 'bewildering',
 'bewilderingly',
 'bewilderment',
 'bewitch',
 'bias',
 'biased',
 'biases',
 'bicker',
 'bickering',
 'bid-rigging',
 'bigotries',
 'bigotry',
 'bitch',
 'bitchy',
 'biting',
 'bitingly',
 'bitter',
 'bitterly',
 'bitterness',
 'bizarre',
 'blab',
 'blabber',
 'blackmail',
 'blah',
 'blame',
 'blameworthy',
 'bland',
 'blandish',
 'blaspheme',
 'blasphemous',
 'blasphemy',
 'blasted',
 'blatant',
 'blatantly',
 'blather',
 'bleak',
 'bleakly',
 'bleakness',
 'bleed',
 'bleeding',
 'bleeds',
 'blemish',
 'blind',
 'blinding',
 'blindingly',
 'blindside',
 'blister',
 'blistering',
 'bloated',
 'blockage',
 'blockhead',
 'bloodshed',
 'bloodthirsty',
 'bloody',
 'blotchy',
 'blow',
 'blunder',
 'blundering',
 'blunders',
 'blunt',
 'blur',
 'bluring',
 'blurred',
 'blurring',
 'blurry',
 'blurs',
 'blurt',
 'boastful',
 'boggle',
 'bogus',
 'boil',
 'boiling',
 'boisterous',
 'bomb',
 'bombard',
 'bombardment',
 'bombastic',
 'bondage',
 'bonkers',
 'bore',
 'bored',
 'boredom',
 'bores',
 'boring',
 'botch',
 'bother',
 'bothered',
 'bothering',
 'bothers',
 'bothersome',
 'bowdlerize',
 'boycott',
 'braggart',
 'bragger',
 'brainless',
 'brainwash',
 'brash',
 'brashly',
 'brashness',
 'brat',
 'bravado',
 'brazen',
 'brazenly',
 'brazenness',
 'breach',
 'break',
 'break-up',
 'break-ups',
 'breakdown',
 'breaking',
 'breaks',
 'breakup',
 'breakups',
 'bribery',
 'brimstone',
 'bristle',
 'brittle',
 'broke',
 'broken',
 'broken-hearted',
 'brood',
 'browbeat',
 'bruise',
 'bruised',
 'bruises',
 'bruising',
 'brusque',
 'brutal',
 'brutalising',
 'brutalities',
 'brutality',
 'brutalize',
 'brutalizing',
 'brutally',
 'brute',
 'brutish',
 'bs',
 'buckle',
 'bug',
 'bugging',
 'buggy',
 'bugs',
 'bulkier',
 'bulkiness',
 'bulky',
 'bulkyness',
 'bull****',
 'bull----',
 'bullies',
 'bullshit',
 'bullshyt',
 'bully',
 'bullying',
 'bullyingly',
 'bum',
 'bump',
 'bumped',
 'bumping',
 'bumpping',
 'bumps',
 'bumpy',
 'bungle',
 'bungler',
 'bungling',
 'bunk',
 'burden',
 'burdensome',
 'burdensomely',
 'burn',
 'burned',
 'burning',
 'burns',
 'bust',
 'busts',
 'busybody',
 'butcher',
 'butchery',
 'buzzing',
 'byzantine',
 'cackle',
 'calamities',
 'calamitous',
 'calamitously',
 'calamity',
 'callous',
 'calumniate',
 'calumniation',
 'calumnies',
 'calumnious',
 'calumniously',
 'calumny',
 'cancer',
 'cancerous',
 'cannibal',
 'cannibalize',
 'capitulate',
 'capricious',
 'capriciously',
 'capriciousness',
 'capsize',
 'careless',
 'carelessness',
 'caricature',
 'carnage',
 'carp',
 'cartoonish',
 'cash-strapped',
 'castigate',
 'castrated',
 'casualty',
 'cataclysm',
 'cataclysmal',
 'cataclysmic',
 'cataclysmically',
 'catastrophe',
 'catastrophes',
 'catastrophic',
 'catastrophically',
 'catastrophies',
 'caustic',
 'caustically',
 'cautionary',
 'cave',
 'censure',
 'chafe',
 'chaff',
 'chagrin',
 'challenging',
 'chaos',
 'chaotic',
 'chasten',
 'chastise',
 'chastisement',
 'chatter',
 'chatterbox',
 'cheap',
 'cheapen',
 'cheaply',
 'cheat',
 'cheated',
 'cheater',
 'cheating',
 'cheats',
 'checkered',
 'cheerless',
 'cheesy',
 'chide',
 'childish',
 'chill',
 'chilly',
 'chintzy',
 'choke',
 'choleric',
 'choppy',
 'chore',
 'chronic',
 'chunky',
 'clamor',
 'clamorous',
 'clash',
 'cliche',
 'cliched',
 'clique',
 'clog',
 'clogged',
 'clogs',
 'cloud',
 'clouding',
 'cloudy',
 'clueless',
 'clumsy',
 'clunky',
 'coarse',
 'cocky',
 'coerce',
 'coercion',
 'coercive',
 'cold',
 'coldly',
 'collapse',
 'collude',
 'collusion',
 'combative',
 'combust',
 'comical',
 'commiserate',
 'commonplace',
 'commotion',
 'commotions',
 'complacent',
 'complain',
 'complained',
 'complaining',
 'complains',
 'complaint',
 'complaints',
 'complex',
 'complicated',
 'complication',
 'complicit',
 'compulsion',
 'compulsive',
 'concede',
 'conceded',
 'conceit',
 'conceited',
 'concen',
 'concens',
 'concern',
 'concerned',
 'concerns',
 'concession',
 'concessions',
 'condemn',
 'condemnable',
 'condemnation',
 'condemned',
 'condemns',
 'condescend',
 'condescending',
 'condescendingly',
 'condescension',
 'confess',
 'confession',
 'confessions',
 'confined',
 'conflict',
 'conflicted',
 'conflicting',
 'conflicts',
 'confound',
 'confounded',
 'confounding',
 'confront',
 'confrontation',
 'confrontational',
 'confuse',
 'confused',
 'confuses',
 'confusing',
 'confusion',
 'confusions',
 'congested',
 'congestion',
 'cons',
 'conscons',
 'conservative',
 'conspicuous',
 'conspicuously',
 'conspiracies',
 'conspiracy',
 'conspirator',
 'conspiratorial',
 'conspire',
 'consternation',
 'contagious',
 'contaminate',
 'contaminated',
 'contaminates',
 'contaminating',
 'contamination',
 'contempt',
 'contemptible',
 'contemptuous',
 'contemptuously',
 'contend',
 'contention',
 'contentious',
 'contort',
 'contortions',
 'contradict',
 'contradiction',
 'contradictory',
 'contrariness',
 'contravene',
 'contrive',
 'contrived',
 'controversial',
 'controversy',
 'convoluted',
 'corrode',
 'corrosion',
 'corrosions',
 'corrosive',
 'corrupt',
 'corrupted',
 'corrupting',
 'corruption',
 'corrupts',
 'corruptted',
 'costlier',
 'costly',
 'counter-productive',
 'counterproductive',
 'coupists',
 'covetous',
 'coward',
 'cowardly',
 'crabby',
 'crack',
 'cracked',
 'cracks',
 'craftily',
 'craftly',
 'crafty',
 'cramp',
 'cramped',
 'cramping',
 'cranky',
 'crap',
 'crappy',
 'craps',
 'crash',
 'crashed',
 'crashes',
 'crashing',
 'crass',
 'craven',
 'cravenly',
 'craze',
 'crazily',
 'craziness',
 'crazy',
 'creak',
 'creaking',
 'creaks',
 'credulous',
 'creep',
 'creeping',
 'creeps',
 'creepy',
 'crept',
 'crime',
 'criminal',
 'cringe',
 'cringed',
 'cringes',
 'cripple',
 'crippled',
 'cripples',
 'crippling',
 'crisis',
 'critic',
 'critical',
 'criticism',
 'criticisms',
 'criticize',
 'criticized',
 'criticizing',
 'critics',
 'cronyism',
 'crook',
 'crooked',
 'crooks',
 'crowded',
 'crowdedness',
 'crude',
 'cruel',
 'crueler',
 'cruelest',
 'cruelly',
 'cruelness',
 'cruelties',
 'cruelty',
 'crumble',
 'crumbling',
 'crummy',
 'crumple',
 'crumpled',
 'crumples',
 'crush',
 'crushed',
 'crushing',
 'cry',
 'culpable',
 'culprit',
 'cumbersome',
 'cunt',
 'cunts',
 'cuplrit',
 'curse',
 'cursed',
 'curses',
 'curt',
 'cuss',
 'cussed',
 'cutthroat',
 'cynical',
 'cynicism',
 'd*mn',
 'damage',
 'damaged',
 'damages',
 'damaging',
 'damn',
 'damnable',
 'damnably',
 'damnation',
 'damned',
 'damning',
 'damper',
 'danger',
 'dangerous',
 'dangerousness',
 'dark',
 'darken',
 'darkened',
 'darker',
 'darkness',
 'dastard',
 'dastardly',
 'daunt',
 'daunting',
 'dauntingly',
 'dawdle',
 'daze',
 'dazed',
 'dead',
 'deadbeat',
 'deadlock',
 'deadly',
 'deadweight',
 'deaf',
 'dearth',
 'death',
 'debacle',
 'debase',
 'debasement',
 'debaser',
 'debatable',
 'debauch',
 'debaucher',
 'debauchery',
 'debilitate',
 'debilitating',
 'debility',
 'debt',
 'debts',
 'decadence',
 'decadent',
 'decay',
 'decayed',
 'deceit',
 'deceitful',
 'deceitfully',
 'deceitfulness',
 'deceive',
 'deceiver',
 'deceivers',
 'deceiving',
 'deception',
 'deceptive',
 'deceptively',
 'declaim',
 'decline',
 'declines',
 'declining',
 'decrement',
 'decrepit',
 'decrepitude',
 'decry',
 'defamation',
 'defamations',
 'defamatory',
 'defame',
 'defect',
 'defective',
 'defects',
 'defensive',
 'defiance',
 'defiant',
 'defiantly',
 'deficiencies',
 'deficiency',
 'deficient',
 'defile',
 'defiler',
 'deform',
 'deformed',
 'defrauding',
 'defunct',
 'defy',
 'degenerate',
 'degenerately',
 'degeneration',
 'degradation',
 'degrade',
 'degrading',
 'degradingly',
 'dehumanization',
 'dehumanize',
 'deign',
 'deject',
 'dejected',
 'dejectedly',
 'dejection',
 'delay',
 'delayed',
 'delaying',
 'delays',
 'delinquency',
 'delinquent',
 'delirious',
 'delirium',
 'delude',
 'deluded',
 'deluge',
 'delusion',
 'delusional',
 'delusions',
 'demean',
 'demeaning',
 'demise',
 'demolish',
 'demolisher',
 'demon',
 'demonic',
 'demonize',
 'demonized',
 'demonizes',
 'demonizing',
 'demoralize',
 'demoralizing',
 'demoralizingly',
 'denial',
 'denied',
 'denies',
 'denigrate',
 'denounce',
 'dense',
 'dent',
 'dented',
 'dents',
 'denunciate',
 'denunciation',
 'denunciations',
 'deny',
 'denying',
 'deplete',
 'deplorable',
 'deplorably',
 'deplore',
 'deploring',
 'deploringly',
 'deprave',
 'depraved',
 'depravedly',
 'deprecate',
 'depress',
 'depressed',
 'depressing',
 'depressingly',
 'depression',
 'depressions',
 'deprive',
 'deprived',
 'deride',
 'derision',
 'derisive',
 'derisively',
 'derisiveness',
 'derogatory',
 'desecrate',
 'desert',
 'desertion',
 'desiccate',
 'desiccated',
 'desititute',
 'desolate',
 'desolately',
 'desolation',
 'despair',
 'despairing',
 'despairingly',
 'desperate',
 'desperately',
 ...]
In [17]:
##count_words function
def count_polar2 (sentiment_words, tokens):
    sentiment_tokens = [token for token in tokens \
                     if token in sentiment_words]
    negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
    for idx, token in enumerate(tokens):
        if token in sentiment_words:
            if idx>0:
                if tokens[idx-1] not in negations:
                    sentiment_tokens.append(token)
            else:
                sentiment_tokens.append(token)
    return len(sentiment_tokens)
In [18]:
## add high frequency words
def get_count(risk_factor_text):
    ls_positive_count = []
    ls_negative_count = []
    ls_positive_prec  = []
    ls_negative_prec  = []
    for text in risk_factor_text:
        text = str(text)
        stop_words = stopwords.words('english')
        stop_words+=["risks", "risk", "competitive"]

        tokens = re.split(r"\W+", text)
        tokens = nltk.word_tokenize(text)   

        tokens=[token.strip(string.punctuation) for token in tokens\
               if token not in stop_words]
        tokens=[token.strip() for token in tokens if token.strip()!='']


        positive_count = count_polar2(positive_words, tokens)

        negative_count = count_polar2(negative_words, tokens)
        positive_prec  = positive_count / (positive_count + negative_count + 0.000000000001)
        negative_prec  = negative_count / (positive_count + negative_count + 0.000000000001)


        ls_positive_count.append(positive_count)
        ls_negative_count.append(negative_count)
        ls_positive_prec.append(positive_prec)
        ls_negative_prec.append(negative_prec)

print(ls_positive_count,ls_negative_count)
print(ls_positive_prec, ls_negative_prec)

    
[142, 128, 170, 182, 626, 612, 600, 572, 698, 254, 268, 260, 256, 284, 398, 388, 146, 248, 336, 112, 130, 132, 132, 136, 266, 294, 342, 328, 388, 98, 132, 116, 174, 168, 860, 934, 928, 904, 994, 0, 0, 0, 0, 0, 112, 106, 124, 162, 136, 308, 336, 426, 468, 500, 170, 226, 220, 202, 246, 240, 252, 288, 106, 104, 208, 422, 396, 252, 536, 572, 574, 202, 260, 274, 120, 122, 0, 120, 138, 134, 130, 200, 414, 386, 426, 358, 494, 444, 436, 442, 446, 198, 216, 202, 280, 274, 350, 388, 408, 296, 268, 538, 532, 596, 612, 452, 350, 410, 480, 532, 542, 764, 678, 760, 646, 662, 332, 324, 310, 242, 246, 338, 340, 294, 292, 244, 68, 64, 54, 57, 182, 78, 82, 88, 102, 198, 130, 132, 136, 186, 188, 1390, 1564, 818, 888, 788, 30, 30, 30, 70, 126, 332, 406, 0, 0, 0, 334, 346, 366, 362, 440, 138, 148, 156, 152, 154, 374, 294, 274, 294, 284, 186, 178, 182, 184, 186, 400, 394, 390, 386, 388, 156, 180, 190, 410, 420, 368, 364, 308, 300, 378, 100, 132, 132, 140, 376, 482, 466, 486, 0] [206, 202, 246, 220, 590, 552, 562, 574, 606, 294, 314, 320, 304, 296, 264, 240, 192, 314, 426, 230, 252, 244, 232, 246, 378, 424, 456, 466, 510, 114, 148, 138, 164, 162, 810, 814, 854, 862, 982, 0, 0, 0, 0, 0, 192, 202, 216, 254, 254, 384, 346, 352, 364, 346, 270, 354, 354, 294, 392, 378, 394, 466, 142, 164, 234, 284, 288, 470, 1048, 1090, 1114, 238, 288, 300, 148, 160, 0, 160, 192, 188, 182, 294, 396, 372, 372, 304, 598, 592, 580, 606, 600, 274, 270, 270, 362, 354, 398, 372, 386, 318, 294, 722, 710, 724, 738, 646, 412, 506, 574, 608, 664, 762, 742, 922, 888, 900, 524, 434, 436, 374, 378, 392, 386, 364, 336, 274, 52, 60, 56, 52, 118, 82, 86, 76, 90, 160, 120, 124, 120, 176, 174, 1326, 1454, 912, 920, 816, 56, 56, 54, 90, 148, 416, 490, 0, 0, 0, 374, 398, 456, 460, 508, 224, 254, 294, 302, 306, 346, 266, 258, 390, 424, 198, 188, 180, 194, 152, 342, 332, 332, 340, 420, 198, 266, 272, 310, 326, 436, 484, 412, 368, 382, 158, 186, 186, 206, 568, 696, 714, 710, 0]
[0.40804597701149303, 0.38787878787878666, 0.40865384615384515, 0.452736318407959, 0.514802631578947, 0.5257731958762882, 0.5163511187607569, 0.4991273996509595, 0.5352760736196315, 0.46350364963503565, 0.4604810996563566, 0.44827586206896475, 0.4571428571428563, 0.48965517241379225, 0.6012084592145006, 0.6178343949044576, 0.4319526627218922, 0.4412811387900348, 0.44094488188976316, 0.3274853801169581, 0.34031413612565353, 0.3510638297872331, 0.3626373626373616, 0.356020942408376, 0.4130434782608689, 0.4094707520891359, 0.42857142857142805, 0.4130982367758181, 0.43207126948775004, 0.46226415094339407, 0.4714285714285697, 0.45669291338582496, 0.5147928994082824, 0.5090909090909075, 0.5149700598802393, 0.5343249427917617, 0.5207631874298538, 0.511891279728199, 0.5030364372469633, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3684210526315777, 0.344155844155843, 0.3647058823529401, 0.389423076923076, 0.3487179487179478, 0.4450867052023115, 0.49266862170087905, 0.5475578406169659, 0.5624999999999993, 0.5910165484633563, 0.38636363636363547, 0.38965517241379244, 0.3832752613240411, 0.40725806451612817, 0.38557993730407464, 0.3883495145631062, 0.39009287925696534, 0.3819628647214849, 0.42741935483870797, 0.3880597014925358, 0.47058823529411653, 0.5977337110481578, 0.5789473684210518, 0.3490304709141269, 0.3383838383838382, 0.34416365824308043, 0.34004739336492873, 0.45909090909090805, 0.47445255474452463, 0.47735191637630575, 0.44776119402984904, 0.4326241134751757, 0.0, 0.428571428571427, 0.4181818181818169, 0.41614906832298004, 0.4166666666666653, 0.40485829959514086, 0.5111111111111105, 0.5092348284960415, 0.5338345864661648, 0.5407854984894251, 0.452380952380952, 0.4285714285714282, 0.4291338582677161, 0.4217557251908393, 0.4263862332695981, 0.4194915254237279, 0.44444444444444353, 0.42796610169491434, 0.43613707165108967, 0.4363057324840757, 0.46791443850267317, 0.510526315789473, 0.5138539042821152, 0.48208469055374514, 0.47686832740213436, 0.4269841269841267, 0.4283413848631237, 0.4515151515151512, 0.45333333333333303, 0.41165755919854247, 0.45931758530183664, 0.44759825327510866, 0.4554079696394683, 0.4666666666666663, 0.4494195688225536, 0.5006553079947572, 0.47746478873239406, 0.4518430439952435, 0.42112125162972597, 0.42381562099871933, 0.38785046728971917, 0.427440633245382, 0.4155495978552273, 0.3928571428571422, 0.3942307692307686, 0.46301369863013636, 0.4683195592286495, 0.4468085106382972, 0.46496815286624127, 0.4710424710424701, 0.566666666666662, 0.5161290322580604, 0.49090909090908647, 0.522935779816509, 0.6066666666666646, 0.487499999999997, 0.4880952380952352, 0.5365853658536552, 0.5312499999999972, 0.5530726256983225, 0.5199999999999979, 0.5156249999999979, 0.5312499999999979, 0.5138121546961312, 0.5193370165745842, 0.511782032400589, 0.5182239893969515, 0.4728323699421963, 0.49115044247787587, 0.4912718204488775, 0.34883720930232154, 0.34883720930232154, 0.35714285714285293, 0.4374999999999973, 0.4598540145985384, 0.44385026737967853, 0.4531249999999995, 0.0, 0.0, 0.0, 0.47175141242937785, 0.4650537634408596, 0.4452554744525542, 0.4403892944038924, 0.4641350210970459, 0.3812154696132586, 0.36815920398009855, 0.3466666666666659, 0.3348017621145367, 0.3347826086956514, 0.5194444444444437, 0.524999999999999, 0.5150375939849614, 0.4298245614035081, 0.40112994350282427, 0.4843749999999987, 0.4863387978142063, 0.5027624309392251, 0.48677248677248547, 0.550295857988164, 0.5390835579514818, 0.5426997245179056, 0.5401662049861489, 0.5316804407713491, 0.48019801980197957, 0.44067796610169363, 0.4035874439461874, 0.4112554112554103, 0.5694444444444436, 0.5630026809651467, 0.45771144278606907, 0.4292452830188674, 0.42777777777777715, 0.44910179640718495, 0.4973684210526309, 0.38759689922480467, 0.4150943396226402, 0.4150943396226402, 0.4046242774566462, 0.39830508474576226, 0.4091680814940574, 0.39491525423728785, 0.4063545150501669, 0.0] [0.591954022988504, 0.6121212121212102, 0.5913461538461524, 0.5472636815920384, 0.48519736842105227, 0.474226804123711, 0.4836488812392423, 0.5008726003490397, 0.46472392638036775, 0.5364963503649625, 0.5395189003436417, 0.5517241379310335, 0.5428571428571418, 0.510344827586206, 0.3987915407854979, 0.38216560509554076, 0.5680473372781047, 0.5587188612099634, 0.5590551181102354, 0.672514619883039, 0.6596858638743438, 0.6489361702127642, 0.6373626373626355, 0.6439790575916213, 0.5869565217391295, 0.5905292479108627, 0.5714285714285707, 0.5869017632241806, 0.5679287305122488, 0.5377358490566012, 0.5285714285714267, 0.5433070866141712, 0.4852071005917145, 0.4909090909090894, 0.4850299401197602, 0.46567505720823776, 0.47923681257014566, 0.4881087202718004, 0.4969635627530362, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6315789473684189, 0.6558441558441537, 0.6352941176470569, 0.6105769230769216, 0.6512820512820495, 0.554913294797687, 0.5073313782991195, 0.45244215938303284, 0.43749999999999944, 0.4089834515366425, 0.6136363636363622, 0.6103448275862058, 0.6167247386759571, 0.5927419354838698, 0.6144200626959238, 0.6116504854368922, 0.6099071207430331, 0.6180371352785138, 0.572580645161288, 0.6119402985074603, 0.5294117647058811, 0.4022662889518408, 0.42105263157894673, 0.6509695290858717, 0.6616161616161612, 0.655836341756919, 0.6599526066350707, 0.5409090909090897, 0.5255474452554735, 0.5226480836236924, 0.5522388059701472, 0.5673758865248206, 0.0, 0.5714285714285693, 0.58181818181818, 0.5838509316770167, 0.5833333333333314, 0.595141700404857, 0.48888888888888826, 0.4907651715039571, 0.466165413533834, 0.4592145015105733, 0.5476190476190471, 0.571428571428571, 0.5708661417322829, 0.5782442748091599, 0.573613766730401, 0.58050847457627, 0.5555555555555544, 0.5720338983050836, 0.5638629283489087, 0.5636942675159227, 0.5320855614973254, 0.4894736842105257, 0.48614609571788353, 0.5179153094462532, 0.5231316725978639, 0.5730158730158726, 0.5716586151368755, 0.5484848484848481, 0.5466666666666663, 0.5883424408014567, 0.540682414698162, 0.5524017467248902, 0.5445920303605308, 0.5333333333333329, 0.5505804311774457, 0.4993446920052422, 0.5225352112676053, 0.5481569560047559, 0.5788787483702734, 0.5761843790012801, 0.6121495327102796, 0.5725593667546166, 0.5844504021447713, 0.6071428571428561, 0.6057692307692297, 0.5369863013698623, 0.5316804407713491, 0.5531914893617013, 0.5350318471337571, 0.5289575289575279, 0.43333333333332974, 0.4838709677419316, 0.5090909090909045, 0.4770642201834819, 0.393333333333332, 0.5124999999999968, 0.5119047619047589, 0.46341463414633866, 0.46874999999999756, 0.4469273743016747, 0.4799999999999981, 0.48437499999999806, 0.4687499999999981, 0.48618784530386605, 0.480662983425413, 0.4882179675994107, 0.48177601060304825, 0.5271676300578032, 0.5088495575221237, 0.5087281795511219, 0.6511627906976669, 0.6511627906976669, 0.6428571428571352, 0.5624999999999966, 0.5401459854014579, 0.5561497326203201, 0.5468749999999993, 0.0, 0.0, 0.0, 0.5282485875706207, 0.5349462365591391, 0.5547445255474446, 0.5596107055961064, 0.535864978902953, 0.6187845303867385, 0.6318407960198988, 0.6533333333333319, 0.6651982378854611, 0.6652173913043463, 0.48055555555555485, 0.47499999999999915, 0.48496240601503665, 0.5701754385964903, 0.5988700564971743, 0.5156249999999987, 0.5136612021857909, 0.4972375690607721, 0.5132275132275118, 0.44970414201183295, 0.4609164420485169, 0.457300275482093, 0.45983379501384974, 0.4683195592286495, 0.5198019801980192, 0.5593220338983035, 0.5964125560538103, 0.5887445887445875, 0.43055555555555497, 0.43699731903485195, 0.5422885572139297, 0.5707547169811313, 0.5722222222222214, 0.5508982035928135, 0.5026315789473678, 0.6124031007751913, 0.5849056603773566, 0.5849056603773566, 0.5953757225433508, 0.6016949152542367, 0.5908319185059419, 0.6050847457627114, 0.5936454849498323, 0.0]
In [19]:
from scipy.stats import linregress
linregress(ls_negative_count,edf)
linregress(ls_positive_count,edf)
linregress(ls_negative_prec, edf)
linregress(ls_positive_prec, edf)
Out[19]:
LinregressResult(slope=-6.89614544208486e-06, intercept=0.05143096390515523, rvalue=-0.013627767916311054, pvalue=0.8481130253696473, stderr=3.5959095314200034e-05)
Out[19]:
LinregressResult(slope=6.834838553424016e-06, intercept=0.04677439087379673, rvalue=0.012504820609897912, pvalue=0.8604956414332507, stderr=3.884044896977195e-05)
Out[19]:
LinregressResult(slope=-0.034490007809650336, intercept=0.0667821067711464, rvalue=-0.03569743856283507, pvalue=0.6157845329186611, stderr=0.06861937435618914)
Out[19]:
LinregressResult(slope=0.040328454432028135, intercept=0.03149618787358385, rvalue=0.03633698193370003, pvalue=0.6094699342286296, stderr=0.07882123306274227)
In [21]:
plt.plot(range(0,200),ls_negative_prec[0:200])
plt.plot(range(0,200),edf[0:200], color = 'r')
plt.figure(figsize=(20,10))
Out[21]:
[<matplotlib.lines.Line2D at 0x1e54447c5f8>]
Out[21]:
[<matplotlib.lines.Line2D at 0x1e54447ce10>]
Out[21]:
<Figure size 1440x720 with 0 Axes>
<Figure size 1440x720 with 0 Axes>
In [ ]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
compound = []
compound_score = 0
n = 0
for text in risk_factor_text:
    text = str(text)
    n += 1
    if n % 100 == 0:
        print(n)
    ss = sid.polarity_scores(text)
    #print(ss)
#     for i, index in enumerate(ss):
#         if i == 'compound': 
#             compound_score = index
#             compound.append(compound_score)
    compound_score = ss['compound']
    compound.append(compound_score)
#print(compound)
C:\Users\yyang\AppData\Local\Continuum\anaconda3\lib\site-packages\nltk\twitter\__init__.py:20: UserWarning: The twython library has not been installed. Some functionality from the twitter package will not be available.
  warnings.warn("The twython library has not been installed. "
In [ ]:
from scipy.stats import linregress
linregress(compound,edf)
In [ ]:
kmv_data_RiskFactor_dropna2012 = kmv_data_RiskFactor_dropna[kmv_data_RiskFactor_dropna.year==2012]
risk_factor_text2012 = kmv_data_RiskFactor_dropna2012['risk_factor_text'].tolist()
risk_factor_text2012 = list(map(lambda i: str(i).replace('\n', '').replace('\\n', ''), risk_factor_text2012))

get_count(risk_factor_text2012)
In [10]:
##EDA
riskdict = []
textstr=""
#counttext=0
for text in risk_factor_text:
    text=str(text)
    counttext+=1
    if counttext % 1000 == 0:
        print(counttext)
        print(True)
    textstr+=text
    
    stop_words = stopwords.words('english')
    stop_words+=["risks", "risk", "competitive"]
    
    tokens = re.split(r"\W+", text)
    tokens = nltk.word_tokenize(text)   
    
    tokens=[token.strip(string.punctuation) for token in tokens\
           if token not in stop_words]
    tokens=[token.strip() for token in tokens if token.strip()!='']
  
    riskdict+=tokens
12000
True
13000
True
14000
True
15000
True
16000
True
17000
True
18000
True
19000
True
20000
True
21000
True
22000
True
23000
True
In [ ]:
##Find positive words 
with open("positive-words.txt",'r') as f:
    positive_words=[line.strip() for line in f]

#positive_words
#print(positive_words)
positive_tokens=[token for token in riskdict \
                 if token in positive_words]

#print(positive_tokens)
In [ ]:
# negation words
negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
positive_tokens=[]
positivedict={}
for idx, token in enumerate(tokens):
    if token in positive_words:
        if idx>0:
            if tokens[idx-1] not in negations:
                positive_tokens.append(token)
        else:
            positive_tokens.append(token)
#print(positive_tokens)

## dictionary of positive words
word_dist=nltk.FreqDist(riskdict)    
positive_dict={word: word_dist[word] \
               for word in word_dist \
               if word in positive_tokens \
              for word is not stop_words}

print(positive_dict)
In [33]:
## word cloud for positive words
wc = WordCloud(background_color="white",normalize_plurals=False, width = 800, height = 400).generate_from_frequencies(positive_dict)
plt.figure(figsize=(20,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
Out[33]:
<Figure size 1440x720 with 0 Axes>
Out[33]:
<matplotlib.image.AxesImage at 0x18040fd6780>
Out[33]:
(-0.5, 799.5, 399.5, -0.5)
In [27]:
##Find negative words 
with open("negative-words.txt",'r') as f:
    negative_words=[line.strip() for line in f]

#positive_words
#print(positive_words)
negative_tokens=[token for token in riskdict \
                 if token in positive_words]

#print(negative_tokens)
In [28]:
# negation words
negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
negative_tokens=[]
negativedict={}
for idx, token in enumerate(tokens):
    if token in negative_words:
        if idx>0:
            if tokens[idx-1] not in negations:
                negative_tokens.append(token)
        else:
            negative_tokens.append(token)
#print(positive_tokens)

## dictionary of positive words
word_dist=nltk.FreqDist(riskdict)    
negative_dict={word: word_dist[word] \
               for word in word_dist \
               if word in negative_tokens}

print(negative_dict)
{'risks': 147901, 'lack': 12186, 'limited': 81517, 'adverse': 186710, 'risk': 139417, 'unable': 85657, 'loss': 110424, 'losses': 98228, 'issues': 27054, 'delays': 31264, 'fail': 44086, 'stringent': 10854, 'lose': 21699, 'damages': 23266, 'suffer': 21327, 'harm': 39306, 'decline': 51081, 'inability': 31408, 'interruptions': 10441, 'declining': 5170, 'debt': 129145, 'failure': 72482, 'liability': 71723, 'fails': 7528, 'failures': 16420, 'critical': 15381, 'difficult': 44887, 'shortage': 2471, 'limit': 60958, 'limits': 20948, 'uncertain': 13544, 'unsuccessful': 5690, 'delay': 31217, 'volatile': 13087, 'problems': 18321, 'unwilling': 2550, 'expire': 8780, 'immaterial': 3379, 'impair': 15262, 'crisis': 5518, 'negative': 36344, 'insolvent': 2319, 'impaired': 11955, 'obsolete': 4547, 'disadvantage': 4584, 'inadequate': 6892, 'infringe': 5258, 'infringement': 11295, 'proprietary': 31486, 'insufficient': 7981, 'volatility': 27555, 'issue': 27980, 'errors': 9103, 'deficiencies': 5375, 'weaknesses': 4202, 'harmed': 10855, 'concerns': 12933, 'threat': 3196, 'breach': 18548, 'cautionary': 1267, 'impede': 4397, 'problem': 3697, 'disagree': 1366, 'unavailable': 3957, 'intense': 8077, 'warning': 2149, 'penalty': 3642, 'lengthy': 3362, 'defects': 6669, 'miss': 378, 'drain': 90, 'unlikely': 1421, 'disruptive': 2454, 'concern': 3651, 'weakening': 1326, 'expired': 2627, 'deficiency': 2902, 'slowly': 846, 'discriminate': 179, 'doubt': 599, 'surrender': 769, 'dispute': 2589, 'debts': 2557, 'losing': 1410, 'disagreed': 49, 'inferior': 221, 'split': 923, 'unachievable': 9, 'faults': 293, 'forbidden': 52}
In [29]:
## word cloud for positive words
wc = WordCloud(background_color="white",normalize_plurals=False, width = 800, height = 400).generate_from_frequencies(negative_dict)
plt.figure(figsize=(20,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
Out[29]:
<Figure size 1440x720 with 0 Axes>
Out[29]:
<matplotlib.image.AxesImage at 0x17e5145f4a8>
Out[29]:
(-0.5, 799.5, 399.5, -0.5)
In [6]:
# for year 2012
kmv_data_RiskFactor_dropna2012 = kmv_data_RiskFactor_dropna[kmv_data_RiskFactor_dropna.year==2012]
risk_factor_text2012 = kmv_data_RiskFactor_dropna2012['risk_factor_text'].tolist()
risk_factor_text2012 = list(map(lambda i: str(i).replace('\n', '').replace('\\n', ''), risk_factor_text2012))
riskdict = []
textstr=""
counttext=0


    
for text in risk_factor_text2012:
    text=str(text)
    counttext+=1
    if counttext % 1000 == 0:
        print(counttext)
        print(True)
    textstr+=text
    
    tokens = re.split(r"\W+", text)
    tokens = nltk.word_tokenize(text)   
    
    stop_words = stopwords.words('english')
    stop_words+=["risks", "risk", "competitive"]
    
    tokens=[token.strip(string.punctuation) for token in tokens]
    tokens=[token.strip() for token in tokens if token.strip()!='']
    
    tagged_tokens= nltk.pos_tag(tokens)
    wordnet_lemmatizer = WordNetLemmatizer()
        ##WordNetLemmatizer
    def get_wordnet_pos(pos_tag):
        if pos_tag.startswith('J'):
            return wordnet.ADJ

        elif pos_tag.startswith('V'):
            return wordnet.VERB

        elif pos_tag.startswith('N'):
            return wordnet.NOUN

        elif pos_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    lemmatized_words=[wordnet_lemmatizer.lemmatize\
              (word, get_wordnet_pos(tag)) \
              for (word, tag) in tagged_tokens \
              # remove stop words
              if word not in stop_words and \
              word not in string.punctuation]
   
    riskdict+=lemmatized_words
    
##Find positive words 
with open("positive-words.txt",'r') as f:
    positive_words=[line.strip() for line in f]

#positive_words
#print(positive_words)
positive_tokens=[token for token in riskdict \
                 if token in positive_words]

#print(positive_tokens)
# negation words
negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
positive_tokens=[]
positivedict={}
for idx, token in enumerate(tokens):
    if token in positive_words:
        if idx>0:
            if tokens[idx-1] not in negations:
                positive_tokens.append(token)
        else:
            positive_tokens.append(token)
#print(positive_tokens)

## dictionary of positive words
word_dist=nltk.FreqDist(riskdict)    
positive_dict={word: word_dist[word] \
               for word in word_dist \
               if word in positive_tokens}

print(positive_dict)

## word cloud for positive words
wc = WordCloud(background_color="white",normalize_plurals=False, width = 800, height = 400).generate_from_frequencies(positive_dict)
plt.figure(figsize=(20,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()


##Find negative words 
with open("negative-words.txt",'r') as f:
    negative_words=[line.strip() for line in f]

#negative_words
#print(negative_words)
negative_tokens=[token for token in riskdict \
                 if token in negative_words]

#print(negative_tokens)

# negation words
negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
negative_tokens=[]
negativedict={}
for idx, token in enumerate(tokens):
    if token in negative_words:
        if idx>0:
            if tokens[idx-1] not in negations:
                negative_tokens.append(token)
        else:
            negative_tokens.append(token)
#print(negative_tokens)

## dictionary of negative words
word_dist=nltk.FreqDist(riskdict)    
negative_dict={word: word_dist[word] \
               for word in word_dist \
               if word in negative_tokens}

print(negative_dict)

## word cloud for negative words
wc = WordCloud(background_color="white",normalize_plurals=False, width = 800, height = 400).generate_from_frequencies(negative_dict)
plt.figure(figsize=(20,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
1000
True
2000
True
{'regard': 9111, 'well': 21947, 'successfully': 7362, 'successful': 6360, 'sufficiently': 518, 'variety': 5453, 'effectively': 5669, 'success': 6669, 'timely': 5882, 'favorable': 4981, 'benefit': 9746, 'right': 18661, 'reasonable': 3559, 'significant': 36900, 'reliable': 956, 'secure': 7845, 'assurance': 9765, 'effective': 7204, 'respect': 11065, 'important': 3582, 'enough': 595, 'improve': 4994, 'approval': 14772, 'sufficient': 8100, 'available': 17257, 'work': 8102, 'qualified': 3550, 'protect': 7061, 'leading': 285, 'commitment': 3517, 'advantage': 3108, 'integrated': 927, 'consummate': 855, 'outstanding': 8435, 'restructuring': 967, 'worth': 528, 'succeed': 748, 'qualify': 5205, 'diligently': 40, 'stability': 992, 'positively': 159, 'recover': 2899, 'positive': 993, 'achievement': 478, 'clean': 621, 'win': 464, 'achievable': 74, 'useful': 496, 'notably': 140, 'better': 76, 'harmless': 37, 'appreciable': 8, 'exceeded': 2}
Out[6]:
<Figure size 1440x720 with 0 Axes>
Out[6]:
<matplotlib.image.AxesImage at 0x1e1b0d61550>
Out[6]:
(-0.5, 799.5, 399.5, -0.5)
{'issue': 17773, 'immaterial': 584, 'loss': 37284, 'inability': 5365, 'insufficient': 1334, 'unable': 14994, 'decline': 13288, 'debt': 22793, 'suffer': 4092, 'difficult': 8044, 'delay': 12679, 'infringe': 1735, 'fail': 9559, 'expire': 3071, 'harm': 8628, 'breach': 4885, 'failure': 14331, 'proprietary': 5417, 'adverse': 31383, 'negative': 6029, 'delayed': 242, 'impair': 3765, 'limit': 21854, 'limited': 8229, 'problem': 3982, 'intense': 1473, 'uncertain': 2422, 'shortage': 1739, 'critical': 2440, 'liability': 22057, 'inadequate': 1097, 'dispute': 1935, 'impede': 924, 'disadvantage': 889, 'crisis': 1518, 'threat': 1175, 'infringement': 2039, 'expensive': 2447, 'obsolete': 822, 'unlikely': 276, 'stringent': 1791, 'drain': 36, 'lengthy': 598, 'volatility': 4874, 'lack': 2302, 'volatile': 2295, 'failures': 233, 'unavailable': 648, 'cautionary': 247, 'miss': 132, 'suffered': 3, 'harmed': 16, 'fails': 197, 'slowly': 138, 'delays': 30, 'split': 211, 'prohibitively': 66, 'faults': 1, 'risk': 3, 'limits': 4}
Out[6]:
<Figure size 1440x720 with 0 Axes>
Out[6]:
<matplotlib.image.AxesImage at 0x1e1b0fa5780>
Out[6]:
(-0.5, 799.5, 399.5, -0.5)
In [7]:
##for 2013
kmv_data_RiskFactor_dropna2013 = kmv_data_RiskFactor_dropna[kmv_data_RiskFactor_dropna.year==2013]
risk_factor_text2013 = kmv_data_RiskFactor_dropna2013['risk_factor_text'].tolist()
risk_factor_text2013 = list(map(lambda i: str(i).replace('\n', '').replace('\\n', ''), risk_factor_text2013))
riskdict = []
textstr=""
counttext=0


    
for text in risk_factor_text2013:
    text=str(text)
    counttext+=1
    if counttext % 1000 == 0:
        print(counttext)
        print(True)
    textstr+=text
    
    tokens = re.split(r"\W+", text)
    tokens = nltk.word_tokenize(text)   
    
    stop_words = stopwords.words('english')
    stop_words+=["risks", "risk", "competitive"]
    
    tokens=[token.strip(string.punctuation) for token in tokens]
    tokens=[token.strip() for token in tokens if token.strip()!='']
    
    tagged_tokens= nltk.pos_tag(tokens)
    wordnet_lemmatizer = WordNetLemmatizer()
        ##WordNetLemmatizer
    def get_wordnet_pos(pos_tag):
        if pos_tag.startswith('J'):
            return wordnet.ADJ

        elif pos_tag.startswith('V'):
            return wordnet.VERB

        elif pos_tag.startswith('N'):
            return wordnet.NOUN

        elif pos_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    lemmatized_words=[wordnet_lemmatizer.lemmatize\
              (word, get_wordnet_pos(tag)) \
              for (word, tag) in tagged_tokens \
              # remove stop words
              if word not in stop_words and \
              word not in string.punctuation]
   
    riskdict+=lemmatized_words
    
##Find positive words 
with open("positive-words.txt",'r') as f:
    positive_words=[line.strip() for line in f]

#positive_words
#print(positive_words)
positive_tokens=[token for token in riskdict \
                 if token in positive_words]

#print(positive_tokens)
# negation words
negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
positive_tokens=[]
positivedict={}
for idx, token in enumerate(tokens):
    if token in positive_words:
        if idx>0:
            if tokens[idx-1] not in negations:
                positive_tokens.append(token)
        else:
            positive_tokens.append(token)
#print(positive_tokens)

## dictionary of positive words
word_dist=nltk.FreqDist(riskdict)    
positive_dict={word: word_dist[word] \
               for word in word_dist \
               if word in positive_tokens}

print(positive_dict)

## word cloud for positive words
wc = WordCloud(background_color="white",normalize_plurals=False, width = 800, height = 400).generate_from_frequencies(positive_dict)
plt.figure(figsize=(20,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()


##Find negative words 
with open("negative-words.txt",'r') as f:
    negative_words=[line.strip() for line in f]

#negative_words
#print(negative_words)
negative_tokens=[token for token in riskdict \
                 if token in negative_words]

#print(negative_tokens)

# negation words
negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
negative_tokens=[]
negativedict={}
for idx, token in enumerate(tokens):
    if token in negative_words:
        if idx>0:
            if tokens[idx-1] not in negations:
                negative_tokens.append(token)
        else:
            negative_tokens.append(token)
#print(negative_tokens)

## dictionary of negative words
word_dist=nltk.FreqDist(riskdict)    
negative_dict={word: word_dist[word] \
               for word in word_dist \
               if word in negative_tokens}

print(negative_dict)

## word cloud for negative words
wc = WordCloud(background_color="white",normalize_plurals=False, width = 800, height = 400).generate_from_frequencies(negative_dict)
plt.figure(figsize=(20,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
1000
True
2000
True
{'protection': 7782, 'improve': 5673, 'available': 18797, 'outstanding': 9228, 'well': 24451, 'effectively': 6494, 'regard': 10287, 'significant': 41641, 'approval': 15717, 'recover': 3284, 'variety': 6077, 'enough': 637, 'favorable': 5423, 'protect': 7933, 'sufficient': 8983, 'work': 8984, 'successfully': 8138, 'qualify': 5788, 'clean': 674, 'integrated': 999, 'worth': 592, 'success': 7248, 'advantage': 3434, 'win': 523, 'successful': 6982, 'stability': 1040, 'effective': 8313, 'flexibility': 2311, 'secure': 8729, 'right': 20213, 'important': 4038, 'reliable': 1073, 'timely': 6626, 'respect': 12423, 'benefit': 11315, 'reasonable': 3994, 'consummate': 934, 'qualified': 4128, 'succeed': 796, 'restructuring': 1074, 'commitment': 3916, 'positive': 1099, 'useful': 565, 'sufficiently': 652, 'better': 81, 'leading': 285, 'notably': 145, 'exceeded': 4, 'positively': 184, 'achievement': 508, 'appreciable': 9, 'achievable': 77, 'diligently': 44, 'harmless': 40}
Out[7]:
<Figure size 1440x720 with 0 Axes>
Out[7]:
<matplotlib.image.AxesImage at 0x1e1b0c8e908>
Out[7]:
(-0.5, 799.5, 399.5, -0.5)
{'lack': 2491, 'decline': 14338, 'limit': 24093, 'adverse': 35470, 'delay': 14077, 'unable': 16570, 'loss': 40745, 'issue': 19775, 'fail': 10688, 'stringent': 2079, 'suffer': 4588, 'harm': 9654, 'expire': 3389, 'inability': 5897, 'debt': 25702, 'failure': 16377, 'liability': 24946, 'fails': 223, 'threat': 1537, 'failures': 287, 'crisis': 1454, 'breach': 6200, 'critical': 2880, 'difficult': 8802, 'shortage': 1942, 'volatile': 2550, 'problem': 4295, 'immaterial': 667, 'impair': 4157, 'insufficient': 1519, 'expensive': 2612, 'negative': 6890, 'limited': 9028, 'obsolete': 910, 'disadvantage': 1005, 'inadequate': 1293, 'dispute': 2220, 'infringe': 1899, 'proprietary': 5873, 'infringement': 2172, 'volatility': 5398, 'cautionary': 257, 'impede': 1012, 'delayed': 301, 'intense': 1569, 'unavailable': 731, 'uncertain': 2680, 'split': 209, 'lengthy': 652, 'unlikely': 293, 'miss': 146, 'drain': 39, 'delays': 32, 'slowly': 169, 'prohibitively': 69, 'harmed': 15, 'faults': 1, 'risk': 7, 'suffered': 1}
Out[7]:
<Figure size 1440x720 with 0 Axes>
Out[7]:
<matplotlib.image.AxesImage at 0x1e1afe89208>
Out[7]:
(-0.5, 799.5, 399.5, -0.5)
In [9]:
positive_dict2013 = positive_dict
print(positive_dict2013)
negative_dict2013 = negative_dict
print(negative_dict2013)
{'protection': 7782, 'improve': 5673, 'available': 18797, 'outstanding': 9228, 'well': 24451, 'effectively': 6494, 'regard': 10287, 'significant': 41641, 'approval': 15717, 'recover': 3284, 'variety': 6077, 'enough': 637, 'favorable': 5423, 'protect': 7933, 'sufficient': 8983, 'work': 8984, 'successfully': 8138, 'qualify': 5788, 'clean': 674, 'integrated': 999, 'worth': 592, 'success': 7248, 'advantage': 3434, 'win': 523, 'successful': 6982, 'stability': 1040, 'effective': 8313, 'flexibility': 2311, 'secure': 8729, 'right': 20213, 'important': 4038, 'reliable': 1073, 'timely': 6626, 'respect': 12423, 'benefit': 11315, 'reasonable': 3994, 'consummate': 934, 'qualified': 4128, 'succeed': 796, 'restructuring': 1074, 'commitment': 3916, 'positive': 1099, 'useful': 565, 'sufficiently': 652, 'better': 81, 'leading': 285, 'notably': 145, 'exceeded': 4, 'positively': 184, 'achievement': 508, 'appreciable': 9, 'achievable': 77, 'diligently': 44, 'harmless': 40}
{'lack': 2491, 'decline': 14338, 'limit': 24093, 'adverse': 35470, 'delay': 14077, 'unable': 16570, 'loss': 40745, 'issue': 19775, 'fail': 10688, 'stringent': 2079, 'suffer': 4588, 'harm': 9654, 'expire': 3389, 'inability': 5897, 'debt': 25702, 'failure': 16377, 'liability': 24946, 'fails': 223, 'threat': 1537, 'failures': 287, 'crisis': 1454, 'breach': 6200, 'critical': 2880, 'difficult': 8802, 'shortage': 1942, 'volatile': 2550, 'problem': 4295, 'immaterial': 667, 'impair': 4157, 'insufficient': 1519, 'expensive': 2612, 'negative': 6890, 'limited': 9028, 'obsolete': 910, 'disadvantage': 1005, 'inadequate': 1293, 'dispute': 2220, 'infringe': 1899, 'proprietary': 5873, 'infringement': 2172, 'volatility': 5398, 'cautionary': 257, 'impede': 1012, 'delayed': 301, 'intense': 1569, 'unavailable': 731, 'uncertain': 2680, 'split': 209, 'lengthy': 652, 'unlikely': 293, 'miss': 146, 'drain': 39, 'delays': 32, 'slowly': 169, 'prohibitively': 69, 'harmed': 15, 'faults': 1, 'risk': 7, 'suffered': 1}
In [10]:
##for 2014
kmv_data_RiskFactor_dropna2014 = kmv_data_RiskFactor_dropna[kmv_data_RiskFactor_dropna.year==2014]
risk_factor_text2014 = kmv_data_RiskFactor_dropna2014['risk_factor_text'].tolist()
risk_factor_text2014 = list(map(lambda i: str(i).replace('\n', '').replace('\\n', ''), risk_factor_text2014))
riskdict = []
textstr=""
counttext=0


    
for text in risk_factor_text2014:
    text=str(text)
    counttext+=1
    if counttext % 1000 == 0:
        print(counttext)
        print(True)
    textstr+=text
    
    tokens = re.split(r"\W+", text)
    tokens = nltk.word_tokenize(text)   
    
    stop_words = stopwords.words('english')
    stop_words+=["risks", "risk", "competitive"]
    
    tokens=[token.strip(string.punctuation) for token in tokens]
    tokens=[token.strip() for token in tokens if token.strip()!='']
    
    tagged_tokens= nltk.pos_tag(tokens)
    wordnet_lemmatizer = WordNetLemmatizer()
        ##WordNetLemmatizer
    def get_wordnet_pos(pos_tag):
        if pos_tag.startswith('J'):
            return wordnet.ADJ

        elif pos_tag.startswith('V'):
            return wordnet.VERB

        elif pos_tag.startswith('N'):
            return wordnet.NOUN

        elif pos_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    lemmatized_words=[wordnet_lemmatizer.lemmatize\
              (word, get_wordnet_pos(tag)) \
              for (word, tag) in tagged_tokens \
              # remove stop words
              if word not in stop_words and \
              word not in string.punctuation]
   
    riskdict+=lemmatized_words
    
##Find positive words 
with open("positive-words.txt",'r') as f:
    positive_words=[line.strip() for line in f]

#positive_words
#print(positive_words)
positive_tokens=[token for token in riskdict \
                 if token in positive_words]

#print(positive_tokens)
# negation words
negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
positive_tokens=[]
positivedict={}
for idx, token in enumerate(tokens):
    if token in positive_words:
        if idx>0:
            if tokens[idx-1] not in negations:
                positive_tokens.append(token)
        else:
            positive_tokens.append(token)
#print(positive_tokens)

## dictionary of positive words
word_dist=nltk.FreqDist(riskdict)    
positive_dict={word: word_dist[word] \
               for word in word_dist \
               if word in positive_tokens}

print(positive_dict)

## word cloud for positive words
wc = WordCloud(background_color="white",normalize_plurals=False, width = 800, height = 400).generate_from_frequencies(positive_dict)
plt.figure(figsize=(20,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()


##Find negative words 
with open("negative-words.txt",'r') as f:
    negative_words=[line.strip() for line in f]

#negative_words
#print(negative_words)
negative_tokens=[token for token in riskdict \
                 if token in negative_words]

#print(negative_tokens)

# negation words
negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
negative_tokens=[]
negativedict={}
for idx, token in enumerate(tokens):
    if token in negative_words:
        if idx>0:
            if tokens[idx-1] not in negations:
                negative_tokens.append(token)
        else:
            negative_tokens.append(token)
#print(negative_tokens)

## dictionary of negative words
word_dist=nltk.FreqDist(riskdict)    
negative_dict={word: word_dist[word] \
               for word in word_dist \
               if word in negative_tokens}

print(negative_dict)

## word cloud for negative words
wc = WordCloud(background_color="white",normalize_plurals=False, width = 800, height = 400).generate_from_frequencies(negative_dict)
plt.figure(figsize=(20,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
1000
True
2000
True
{'protection': 8151, 'improve': 5830, 'available': 19297, 'outstanding': 9396, 'well': 25699, 'significant': 43575, 'timely': 7069, 'recover': 3382, 'work': 9382, 'approval': 16322, 'right': 20891, 'respect': 12868, 'assurance': 11026, 'favorable': 5701, 'variety': 6354, 'adequate': 6020, 'stable': 709, 'enough': 653, 'effectively': 6926, 'regard': 10758, 'protect': 8554, 'sufficient': 9488, 'successfully': 8688, 'qualify': 6073, 'effective': 8774, 'flexibility': 2386, 'clean': 709, 'success': 7605, 'innovative': 1027, 'advantage': 3533, 'stability': 1069, 'secure': 8996, 'important': 4168, 'reliable': 1116, 'accurately': 1775, 'integrated': 1025, 'exceed': 5646, 'reasonable': 4249, 'consummate': 998, 'successful': 7298, 'confidence': 2345, 'qualified': 4456, 'effectiveness': 2069, 'succeed': 800, 'restructuring': 1171, 'advanced': 1444, 'commitment': 4011, 'suitable': 1724, 'robust': 293, 'sufficiently': 691, 'positive': 1186, 'fair': 5771, 'desirable': 694, 'accurate': 996, 'useful': 571, 'better': 73, 'leading': 279, 'facilitate': 1144, 'achievement': 540, 'worth': 586, 'win': 525, 'notably': 152, 'positively': 193, 'appreciable': 8, 'achievable': 77, 'harmless': 41, 'exceeded': 1}
Out[10]:
<Figure size 1440x720 with 0 Axes>
Out[10]:
<matplotlib.image.AxesImage at 0x1e1beae1208>
Out[10]:
(-0.5, 799.5, 399.5, -0.5)
{'lack': 2554, 'decline': 14373, 'limit': 25459, 'adverse': 37976, 'delay': 14748, 'debt': 26429, 'uncertain': 2762, 'loss': 42303, 'issue': 20446, 'fail': 11245, 'unsuccessful': 1150, 'stringent': 2164, 'lose': 5978, 'penalty': 7367, 'suffer': 4810, 'harm': 10184, 'unable': 17387, 'expire': 3345, 'inability': 6352, 'failure': 17783, 'liability': 26361, 'fails': 225, 'threat': 1805, 'breach': 7248, 'critical': 3127, 'concern': 5523, 'difficult': 9092, 'shortage': 1996, 'incorrect': 619, 'volatile': 2616, 'problem': 4472, 'immaterial': 705, 'impair': 4286, 'expensive': 2801, 'crisis': 1238, 'negative': 7365, 'limited': 9462, 'disadvantage': 1072, 'obsolete': 927, 'dispute': 2376, 'infringe': 1975, 'proprietary': 6225, 'infringement': 2351, 'volatility': 5551, 'impede': 1067, 'unfamiliar': 66, 'sue': 260, 'delayed': 330, 'intense': 1615, 'warning': 454, 'unavailable': 792, 'split': 206, 'aggressive': 505, 'miss': 147, 'lengthy': 686, 'drain': 35, 'disruptive': 514, 'delays': 26, 'failures': 324, 'insufficient': 1631, 'inadequate': 1400, 'omission': 441, 'slowly': 186, 'enjoin': 202, 'unlikely': 283, 'doubt': 132, 'prohibitively': 74, 'cautionary': 253, 'unproven': 80, 'weaknesses': 92, 'suffered': 3, 'harmed': 13, 'unachievable': 2, 'risk': 7, 'errors': 3}
Out[10]:
<Figure size 1440x720 with 0 Axes>
Out[10]:
<matplotlib.image.AxesImage at 0x1e1b0049978>
Out[10]:
(-0.5, 799.5, 399.5, -0.5)
In [11]:
positive_dict2014 = positive_dict
print(positive_dict2014)
negative_dict2014 = negative_dict
print(negative_dict2014)
{'protection': 8151, 'improve': 5830, 'available': 19297, 'outstanding': 9396, 'well': 25699, 'significant': 43575, 'timely': 7069, 'recover': 3382, 'work': 9382, 'approval': 16322, 'right': 20891, 'respect': 12868, 'assurance': 11026, 'favorable': 5701, 'variety': 6354, 'adequate': 6020, 'stable': 709, 'enough': 653, 'effectively': 6926, 'regard': 10758, 'protect': 8554, 'sufficient': 9488, 'successfully': 8688, 'qualify': 6073, 'effective': 8774, 'flexibility': 2386, 'clean': 709, 'success': 7605, 'innovative': 1027, 'advantage': 3533, 'stability': 1069, 'secure': 8996, 'important': 4168, 'reliable': 1116, 'accurately': 1775, 'integrated': 1025, 'exceed': 5646, 'reasonable': 4249, 'consummate': 998, 'successful': 7298, 'confidence': 2345, 'qualified': 4456, 'effectiveness': 2069, 'succeed': 800, 'restructuring': 1171, 'advanced': 1444, 'commitment': 4011, 'suitable': 1724, 'robust': 293, 'sufficiently': 691, 'positive': 1186, 'fair': 5771, 'desirable': 694, 'accurate': 996, 'useful': 571, 'better': 73, 'leading': 279, 'facilitate': 1144, 'achievement': 540, 'worth': 586, 'win': 525, 'notably': 152, 'positively': 193, 'appreciable': 8, 'achievable': 77, 'harmless': 41, 'exceeded': 1}
{'lack': 2554, 'decline': 14373, 'limit': 25459, 'adverse': 37976, 'delay': 14748, 'debt': 26429, 'uncertain': 2762, 'loss': 42303, 'issue': 20446, 'fail': 11245, 'unsuccessful': 1150, 'stringent': 2164, 'lose': 5978, 'penalty': 7367, 'suffer': 4810, 'harm': 10184, 'unable': 17387, 'expire': 3345, 'inability': 6352, 'failure': 17783, 'liability': 26361, 'fails': 225, 'threat': 1805, 'breach': 7248, 'critical': 3127, 'concern': 5523, 'difficult': 9092, 'shortage': 1996, 'incorrect': 619, 'volatile': 2616, 'problem': 4472, 'immaterial': 705, 'impair': 4286, 'expensive': 2801, 'crisis': 1238, 'negative': 7365, 'limited': 9462, 'disadvantage': 1072, 'obsolete': 927, 'dispute': 2376, 'infringe': 1975, 'proprietary': 6225, 'infringement': 2351, 'volatility': 5551, 'impede': 1067, 'unfamiliar': 66, 'sue': 260, 'delayed': 330, 'intense': 1615, 'warning': 454, 'unavailable': 792, 'split': 206, 'aggressive': 505, 'miss': 147, 'lengthy': 686, 'drain': 35, 'disruptive': 514, 'delays': 26, 'failures': 324, 'insufficient': 1631, 'inadequate': 1400, 'omission': 441, 'slowly': 186, 'enjoin': 202, 'unlikely': 283, 'doubt': 132, 'prohibitively': 74, 'cautionary': 253, 'unproven': 80, 'weaknesses': 92, 'suffered': 3, 'harmed': 13, 'unachievable': 2, 'risk': 7, 'errors': 3}
In [12]:
##for 2015
kmv_data_RiskFactor_dropna2015 = kmv_data_RiskFactor_dropna[kmv_data_RiskFactor_dropna.year==2015]
risk_factor_text2015 = kmv_data_RiskFactor_dropna2015['risk_factor_text'].tolist()
risk_factor_text2015 = list(map(lambda i: str(i).replace('\n', '').replace('\\n', ''), risk_factor_text2015))
riskdict = []
textstr=""
counttext=0


    
for text in risk_factor_text2015:
    text=str(text)
    counttext+=1
    if counttext % 1000 == 0:
        print(counttext)
        print(True)
    textstr+=text
    
    tokens = re.split(r"\W+", text)
    tokens = nltk.word_tokenize(text)   
    
    stop_words = stopwords.words('english')
    stop_words+=["risks", "risk", "competitive"]
    
    tokens=[token.strip(string.punctuation) for token in tokens]
    tokens=[token.strip() for token in tokens if token.strip()!='']
    
    tagged_tokens= nltk.pos_tag(tokens)
    wordnet_lemmatizer = WordNetLemmatizer()
        ##WordNetLemmatizer
    def get_wordnet_pos(pos_tag):
        if pos_tag.startswith('J'):
            return wordnet.ADJ

        elif pos_tag.startswith('V'):
            return wordnet.VERB

        elif pos_tag.startswith('N'):
            return wordnet.NOUN

        elif pos_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    lemmatized_words=[wordnet_lemmatizer.lemmatize\
              (word, get_wordnet_pos(tag)) \
              for (word, tag) in tagged_tokens \
              # remove stop words
              if word not in stop_words and \
              word not in string.punctuation]
   
    riskdict+=lemmatized_words
    
##Find positive words 
with open("positive-words.txt",'r') as f:
    positive_words=[line.strip() for line in f]

#positive_words
#print(positive_words)
positive_tokens=[token for token in riskdict \
                 if token in positive_words]

#print(positive_tokens)
# negation words
negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
positive_tokens=[]
positivedict={}
for idx, token in enumerate(tokens):
    if token in positive_words:
        if idx>0:
            if tokens[idx-1] not in negations:
                positive_tokens.append(token)
        else:
            positive_tokens.append(token)
#print(positive_tokens)

## dictionary of positive words
word_dist=nltk.FreqDist(riskdict)    
positive_dict={word: word_dist[word] \
               for word in word_dist \
               if word in positive_tokens}

print(positive_dict)

## word cloud for positive words
wc = WordCloud(background_color="white",normalize_plurals=False, width = 800, height = 400).generate_from_frequencies(positive_dict)
plt.figure(figsize=(20,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()


##Find negative words 
with open("negative-words.txt",'r') as f:
    negative_words=[line.strip() for line in f]

#negative_words
#print(negative_words)
negative_tokens=[token for token in riskdict \
                 if token in negative_words]

#print(negative_tokens)

# negation words
negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
negative_tokens=[]
negativedict={}
for idx, token in enumerate(tokens):
    if token in negative_words:
        if idx>0:
            if tokens[idx-1] not in negations:
                negative_tokens.append(token)
        else:
            negative_tokens.append(token)
#print(negative_tokens)

## dictionary of negative words
word_dist=nltk.FreqDist(riskdict)    
negative_dict={word: word_dist[word] \
               for word in word_dist \
               if word in negative_tokens}

print(negative_dict)

## word cloud for negative words
wc = WordCloud(background_color="white",normalize_plurals=False, width = 800, height = 400).generate_from_frequencies(negative_dict)
plt.figure(figsize=(20,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
1000
True
2000
True
{'protection': 8687, 'available': 19812, 'outstanding': 9799, 'well': 27195, 'exceed': 5828, 'respect': 13481, 'significant': 45386, 'timely': 7461, 'sufficient': 9835, 'effective': 9209, 'assurance': 11595, 'work': 9768, 'approval': 17320, 'right': 21628, 'successfully': 9057, 'favorable': 5893, 'variety': 6731, 'adequate': 6193, 'stable': 752, 'effectively': 7270, 'regard': 11106, 'protect': 9188, 'qualify': 6226, 'leverage': 4916, 'good': 4989, 'clean': 720, 'success': 7827, 'improve': 5721, 'advantage': 3624, 'stability': 1079, 'attractive': 2701, 'secure': 9221, 'important': 4285, 'reliable': 1192, 'accurately': 1871, 'integrated': 1059, 'recover': 3453, 'reasonable': 4371, 'consummate': 1067, 'successful': 7706, 'confidence': 2455, 'qualified': 4602, 'flexibility': 2522, 'effectiveness': 2241, 'succeed': 796, 'advanced': 1457, 'like': 1355, 'faith': 720, 'restructuring': 1308, 'positive': 1240, 'fair': 6028, 'accurate': 1041, 'leading': 297, 'sufficiently': 705, 'useful': 576, 'suitable': 1783, 'better': 74, 'facilitate': 1224, 'achievement': 565, 'notably': 162, 'positively': 199, 'win': 584, 'appreciable': 10, 'achievable': 81, 'harmless': 42, 'exceeded': 1}
Out[12]:
<Figure size 1440x720 with 0 Axes>
Out[12]:
<matplotlib.image.AxesImage at 0x1e1bf7360b8>
Out[12]:
(-0.5, 799.5, 399.5, -0.5)
{'lack': 2589, 'limit': 26168, 'uncertain': 2780, 'debt': 27437, 'unable': 17980, 'delay': 15585, 'loss': 43330, 'decline': 14940, 'unsuccessful': 1212, 'stringent': 2337, 'adverse': 39707, 'fail': 11906, 'lose': 6236, 'penalty': 8035, 'suffer': 4989, 'harm': 10763, 'expire': 3300, 'inability': 6726, 'failure': 18955, 'liability': 27663, 'fails': 241, 'threat': 2244, 'breach': 8698, 'critical': 3355, 'concern': 5567, 'issue': 21141, 'difficult': 9251, 'shortage': 2022, 'incorrect': 651, 'volatile': 2726, 'impair': 4445, 'limited': 9941, 'problem': 4600, 'expensive': 2874, 'crisis': 1108, 'negative': 7805, 'disadvantage': 1126, 'obsolete': 927, 'dispute': 2555, 'infringe': 2098, 'proprietary': 6820, 'infringement': 2437, 'volatility': 5638, 'impede': 1100, 'unfamiliar': 69, 'immaterial': 706, 'intense': 1680, 'warning': 458, 'unavailable': 854, 'miss': 163, 'lengthy': 711, 'drain': 39, 'disruptive': 546, 'delays': 26, 'failures': 357, 'cautionary': 248, 'inadequate': 1490, 'slowly': 174, 'unlikely': 281, 'doubt': 126, 'unproven': 66, 'weaknesses': 109, 'suffered': 4, 'split': 234, 'harmed': 13, 'expired': 46, 'faults': 1, 'unachievable': 3, 'risk': 5, 'errors': 4}
Out[12]:
<Figure size 1440x720 with 0 Axes>
Out[12]:
<matplotlib.image.AxesImage at 0x1e1be042390>
Out[12]:
(-0.5, 799.5, 399.5, -0.5)
In [13]:
positive_dict2015 = positive_dict
print(positive_dict2015)
negative_dict2015 = negative_dict
print(negative_dict2015)
{'protection': 8687, 'available': 19812, 'outstanding': 9799, 'well': 27195, 'exceed': 5828, 'respect': 13481, 'significant': 45386, 'timely': 7461, 'sufficient': 9835, 'effective': 9209, 'assurance': 11595, 'work': 9768, 'approval': 17320, 'right': 21628, 'successfully': 9057, 'favorable': 5893, 'variety': 6731, 'adequate': 6193, 'stable': 752, 'effectively': 7270, 'regard': 11106, 'protect': 9188, 'qualify': 6226, 'leverage': 4916, 'good': 4989, 'clean': 720, 'success': 7827, 'improve': 5721, 'advantage': 3624, 'stability': 1079, 'attractive': 2701, 'secure': 9221, 'important': 4285, 'reliable': 1192, 'accurately': 1871, 'integrated': 1059, 'recover': 3453, 'reasonable': 4371, 'consummate': 1067, 'successful': 7706, 'confidence': 2455, 'qualified': 4602, 'flexibility': 2522, 'effectiveness': 2241, 'succeed': 796, 'advanced': 1457, 'like': 1355, 'faith': 720, 'restructuring': 1308, 'positive': 1240, 'fair': 6028, 'accurate': 1041, 'leading': 297, 'sufficiently': 705, 'useful': 576, 'suitable': 1783, 'better': 74, 'facilitate': 1224, 'achievement': 565, 'notably': 162, 'positively': 199, 'win': 584, 'appreciable': 10, 'achievable': 81, 'harmless': 42, 'exceeded': 1}
{'lack': 2589, 'limit': 26168, 'uncertain': 2780, 'debt': 27437, 'unable': 17980, 'delay': 15585, 'loss': 43330, 'decline': 14940, 'unsuccessful': 1212, 'stringent': 2337, 'adverse': 39707, 'fail': 11906, 'lose': 6236, 'penalty': 8035, 'suffer': 4989, 'harm': 10763, 'expire': 3300, 'inability': 6726, 'failure': 18955, 'liability': 27663, 'fails': 241, 'threat': 2244, 'breach': 8698, 'critical': 3355, 'concern': 5567, 'issue': 21141, 'difficult': 9251, 'shortage': 2022, 'incorrect': 651, 'volatile': 2726, 'impair': 4445, 'limited': 9941, 'problem': 4600, 'expensive': 2874, 'crisis': 1108, 'negative': 7805, 'disadvantage': 1126, 'obsolete': 927, 'dispute': 2555, 'infringe': 2098, 'proprietary': 6820, 'infringement': 2437, 'volatility': 5638, 'impede': 1100, 'unfamiliar': 69, 'immaterial': 706, 'intense': 1680, 'warning': 458, 'unavailable': 854, 'miss': 163, 'lengthy': 711, 'drain': 39, 'disruptive': 546, 'delays': 26, 'failures': 357, 'cautionary': 248, 'inadequate': 1490, 'slowly': 174, 'unlikely': 281, 'doubt': 126, 'unproven': 66, 'weaknesses': 109, 'suffered': 4, 'split': 234, 'harmed': 13, 'expired': 46, 'faults': 1, 'unachievable': 3, 'risk': 5, 'errors': 4}
In [14]:
##for 2016
kmv_data_RiskFactor_dropna2016 = kmv_data_RiskFactor_dropna[kmv_data_RiskFactor_dropna.year==2016]
risk_factor_text2016 = kmv_data_RiskFactor_dropna2016['risk_factor_text'].tolist()
risk_factor_text2016 = list(map(lambda i: str(i).replace('\n', '').replace('\\n', ''), risk_factor_text2016))
riskdict = []
textstr=""
counttext=0


    
for text in risk_factor_text2016:
    text=str(text)
    counttext+=1
    if counttext % 1000 == 0:
        print(counttext)
        print(True)
    textstr+=text
    
    tokens = re.split(r"\W+", text)
    tokens = nltk.word_tokenize(text)   
    
    stop_words = stopwords.words('english')
    stop_words+=["risks", "risk", "competitive"]
    
    tokens=[token.strip(string.punctuation) for token in tokens]
    tokens=[token.strip() for token in tokens if token.strip()!='']
    
    tagged_tokens= nltk.pos_tag(tokens)
    wordnet_lemmatizer = WordNetLemmatizer()
        ##WordNetLemmatizer
    def get_wordnet_pos(pos_tag):
        if pos_tag.startswith('J'):
            return wordnet.ADJ

        elif pos_tag.startswith('V'):
            return wordnet.VERB

        elif pos_tag.startswith('N'):
            return wordnet.NOUN

        elif pos_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    lemmatized_words=[wordnet_lemmatizer.lemmatize\
              (word, get_wordnet_pos(tag)) \
              for (word, tag) in tagged_tokens \
              # remove stop words
              if word not in stop_words and \
              word not in string.punctuation]
   
    riskdict+=lemmatized_words
    
##Find positive words 
with open("positive-words.txt",'r') as f:
    positive_words=[line.strip() for line in f]

#positive_words
#print(positive_words)
positive_tokens=[token for token in riskdict \
                 if token in positive_words]

#print(positive_tokens)
# negation words
negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
positive_tokens=[]
positivedict={}
for idx, token in enumerate(tokens):
    if token in positive_words:
        if idx>0:
            if tokens[idx-1] not in negations:
                positive_tokens.append(token)
        else:
            positive_tokens.append(token)
#print(positive_tokens)

## dictionary of positive words
word_dist=nltk.FreqDist(riskdict)    
positive_dict={word: word_dist[word] \
               for word in word_dist \
               if word in positive_tokens}

print(positive_dict)

## word cloud for positive words
wc = WordCloud(background_color="white",normalize_plurals=False, width = 800, height = 400).generate_from_frequencies(positive_dict)
plt.figure(figsize=(20,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()


##Find negative words 
with open("negative-words.txt",'r') as f:
    negative_words=[line.strip() for line in f]

#negative_words
#print(negative_words)
negative_tokens=[token for token in riskdict \
                 if token in negative_words]

#print(negative_tokens)

# negation words
negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor']
negative_tokens=[]
negativedict={}
for idx, token in enumerate(tokens):
    if token in negative_words:
        if idx>0:
            if tokens[idx-1] not in negations:
                negative_tokens.append(token)
        else:
            negative_tokens.append(token)
#print(negative_tokens)

## dictionary of negative words
word_dist=nltk.FreqDist(riskdict)    
negative_dict={word: word_dist[word] \
               for word in word_dist \
               if word in negative_tokens}

print(negative_dict)

## word cloud for negative words
wc = WordCloud(background_color="white",normalize_plurals=False, width = 800, height = 400).generate_from_frequencies(negative_dict)
plt.figure(figsize=(20,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
1000
True
2000
True
{'successfully': 9240, 'well': 27879, 'exceed': 6038, 'respect': 13835, 'lead': 10196, 'improve': 5837, 'succeed': 802, 'win': 579, 'advanced': 1494, 'work': 9826, 'timely': 7616, 'effective': 9396, 'variety': 7026, 'protection': 9159, 'good': 5051, 'clean': 758, 'favorable': 6004, 'integrated': 1066, 'success': 7835, 'advantage': 3730, 'significant': 46399, 'stability': 1104, 'stable': 762, 'approval': 17766, 'regard': 11436, 'secure': 9557, 'sufficient': 10108, 'right': 21754, 'reliable': 1220, 'accurately': 1921, 'protect': 9532, 'satisfactory': 1340, 'available': 20235, 'approve': 6999, 'reasonable': 4453, 'consummate': 1249, 'assurance': 12009, 'adequate': 6306, 'successful': 7833, 'confidence': 2506, 'appropriate': 5250, 'qualified': 4651, 'recover': 3453, 'effectively': 7429, 'outstanding': 10169, 'satisfy': 6127, 'flexibility': 2619, 'effectiveness': 2363, 'accurate': 1086, 'useful': 586, 'fair': 6246, 'faith': 743, 'restructuring': 1500, 'positive': 1313, 'fast': 608, 'satisfied': 252, 'promptly': 493, 'cure': 776, 'suitable': 1774, 'qualify': 6324, 'sufficiently': 731, 'leading': 307, 'notably': 170, 'facilitate': 1317, 'achievement': 562, 'orderly': 208, 'equitable': 241, 'better': 71, 'harmless': 42}
Out[14]:
<Figure size 1440x720 with 0 Axes>
Out[14]:
<matplotlib.image.AxesImage at 0x1e1d458d5c0>
Out[14]:
(-0.5, 799.5, 399.5, -0.5)
{'limit': 26569, 'delay': 16020, 'debt': 28614, 'critical': 3470, 'volatile': 2804, 'issue': 21526, 'stringent': 2428, 'concern': 5807, 'loss': 43941, 'adverse': 40900, 'unable': 18284, 'unwilling': 531, 'difficult': 9492, 'expire': 3313, 'immaterial': 688, 'impair': 4573, 'decline': 15679, 'lose': 6404, 'limited': 10215, 'unsuccessful': 1212, 'liability': 28323, 'insufficient': 1757, 'fail': 12157, 'crisis': 1030, 'negative': 8065, 'insolvent': 471, 'inability': 6904, 'failure': 19366, 'lack': 2590, 'disadvantage': 1154, 'suffer': 5030, 'penalty': 8546, 'impaired': 1424, 'deficiency': 1857, 'inadequate': 1552, 'harm': 11201, 'obsolete': 924, 'dispute': 2586, 'infringe': 2106, 'proprietary': 7050, 'infringement': 2421, 'threat': 2425, 'volatility': 5900, 'breach': 9463, 'problem': 4555, 'unavailable': 900, 'fails': 257, 'impede': 1100, 'intense': 1684, 'warning': 454, 'disagree': 385, 'uncertain': 2807, 'shortage': 1993, 'surrender': 311, 'split': 287, 'miss': 155, 'lengthy': 741, 'drain': 45, 'disruptive': 596, 'delays': 26, 'failures': 379, 'weakening': 208, 'expired': 54, 'cautionary': 258, 'slowly': 178, 'discriminate': 67, 'unlikely': 281, 'doubt': 171, 'inferior': 49, 'losing': 13, 'deficiencies': 40, 'weaknesses': 86, 'harmed': 19, 'risk': 7, 'forbidden': 5, 'disagreed': 4, 'debts': 4, 'errors': 4, 'interruptions': 1, 'unachievable': 1}
Out[14]:
<Figure size 1440x720 with 0 Axes>
Out[14]:
<matplotlib.image.AxesImage at 0x1e1b2032400>
Out[14]:
(-0.5, 799.5, 399.5, -0.5)
In [29]:
## all dicts for each year
pdict12 = {'regard': 9111, 'well': 21947, 'successfully': 7362, 'successful': 6360, 'sufficiently': 518, 'variety': 5453, 'effectively': 5669, 'success': 6669, 'timely': 5882, 'favorable': 4981, 'benefit': 9746, 'right': 18661, 'reasonable': 3559, 'significant': 36900, 'reliable': 956, 'secure': 7845, 'assurance': 9765, 'effective': 7204, 'respect': 11065, 'important': 3582, 'enough': 595, 'improve': 4994, 'approval': 14772, 'sufficient': 8100, 'available': 17257, 'work': 8102, 'qualified': 3550, 'protect': 7061, 'leading': 285, 'commitment': 3517, 'advantage': 3108, 'integrated': 927, 'consummate': 855, 'outstanding': 8435, 'restructuring': 967, 'worth': 528, 'succeed': 748, 'qualify': 5205, 'diligently': 40, 'stability': 992, 'positively': 159, 'recover': 2899, 'positive': 993, 'achievement': 478, 'clean': 621, 'win': 464, 'achievable': 74, 'useful': 496, 'notably': 140, 'better': 76, 'harmless': 37, 'appreciable': 8, 'exceeded': 2}
ndict12 = {'issue': 17773, 'immaterial': 584, 'loss': 37284, 'inability': 5365, 'insufficient': 1334, 'unable': 14994, 'decline': 13288, 'debt': 22793, 'suffer': 4092, 'difficult': 8044, 'delay': 12679, 'infringe': 1735, 'fail': 9559, 'expire': 3071, 'harm': 8628, 'breach': 4885, 'failure': 14331, 'proprietary': 5417, 'adverse': 31383, 'negative': 6029, 'delayed': 242, 'impair': 3765, 'limit': 21854, 'limited': 8229, 'problem': 3982, 'intense': 1473, 'uncertain': 2422, 'shortage': 1739, 'critical': 2440, 'liability': 22057, 'inadequate': 1097, 'dispute': 1935, 'impede': 924, 'disadvantage': 889, 'crisis': 1518, 'threat': 1175, 'infringement': 2039, 'expensive': 2447, 'obsolete': 822, 'unlikely': 276, 'stringent': 1791, 'drain': 36, 'lengthy': 598, 'volatility': 4874, 'lack': 2302, 'volatile': 2295, 'failures': 233, 'unavailable': 648, 'cautionary': 247, 'miss': 132, 'suffered': 3, 'harmed': 16, 'fails': 197, 'slowly': 138, 'delays': 30, 'split': 211, 'prohibitively': 66, 'faults': 1, 'risk': 3, 'limits': 4}

pdict13 = {'protection': 7782, 'improve': 5673, 'available': 18797, 'outstanding': 9228, 'well': 24451, 'effectively': 6494, 'regard': 10287, 'significant': 41641, 'approval': 15717, 'recover': 3284, 'variety': 6077, 'enough': 637, 'favorable': 5423, 'protect': 7933, 'sufficient': 8983, 'work': 8984, 'successfully': 8138, 'qualify': 5788, 'clean': 674, 'integrated': 999, 'worth': 592, 'success': 7248, 'advantage': 3434, 'win': 523, 'successful': 6982, 'stability': 1040, 'effective': 8313, 'flexibility': 2311, 'secure': 8729, 'right': 20213, 'important': 4038, 'reliable': 1073, 'timely': 6626, 'respect': 12423, 'benefit': 11315, 'reasonable': 3994, 'consummate': 934, 'qualified': 4128, 'succeed': 796, 'restructuring': 1074, 'commitment': 3916, 'positive': 1099, 'useful': 565, 'sufficiently': 652, 'better': 81, 'leading': 285, 'notably': 145, 'exceeded': 4, 'positively': 184, 'achievement': 508, 'appreciable': 9, 'achievable': 77, 'diligently': 44, 'harmless': 40}
ndict13 = {'lack': 2491, 'decline': 14338, 'limit': 24093, 'adverse': 35470, 'delay': 14077, 'unable': 16570, 'loss': 40745, 'issue': 19775, 'fail': 10688, 'stringent': 2079, 'suffer': 4588, 'harm': 9654, 'expire': 3389, 'inability': 5897, 'debt': 25702, 'failure': 16377, 'liability': 24946, 'fails': 223, 'threat': 1537, 'failures': 287, 'crisis': 1454, 'breach': 6200, 'critical': 2880, 'difficult': 8802, 'shortage': 1942, 'volatile': 2550, 'problem': 4295, 'immaterial': 667, 'impair': 4157, 'insufficient': 1519, 'expensive': 2612, 'negative': 6890, 'limited': 9028, 'obsolete': 910, 'disadvantage': 1005, 'inadequate': 1293, 'dispute': 2220, 'infringe': 1899, 'proprietary': 5873, 'infringement': 2172, 'volatility': 5398, 'cautionary': 257, 'impede': 1012, 'delayed': 301, 'intense': 1569, 'unavailable': 731, 'uncertain': 2680, 'split': 209, 'lengthy': 652, 'unlikely': 293, 'miss': 146, 'drain': 39, 'delays': 32, 'slowly': 169, 'prohibitively': 69, 'harmed': 15, 'faults': 1, 'risk': 7, 'suffered': 1}

pdict14 = {'protection': 8151, 'improve': 5830, 'available': 19297, 'outstanding': 9396, 'well': 25699, 'significant': 43575, 'timely': 7069, 'recover': 3382, 'work': 9382, 'approval': 16322, 'right': 20891, 'respect': 12868, 'assurance': 11026, 'favorable': 5701, 'variety': 6354, 'adequate': 6020, 'stable': 709, 'enough': 653, 'effectively': 6926, 'regard': 10758, 'protect': 8554, 'sufficient': 9488, 'successfully': 8688, 'qualify': 6073, 'effective': 8774, 'flexibility': 2386, 'clean': 709, 'success': 7605, 'innovative': 1027, 'advantage': 3533, 'stability': 1069, 'secure': 8996, 'important': 4168, 'reliable': 1116, 'accurately': 1775, 'integrated': 1025, 'exceed': 5646, 'reasonable': 4249, 'consummate': 998, 'successful': 7298, 'confidence': 2345, 'qualified': 4456, 'effectiveness': 2069, 'succeed': 800, 'restructuring': 1171, 'advanced': 1444, 'commitment': 4011, 'suitable': 1724, 'robust': 293, 'sufficiently': 691, 'positive': 1186, 'fair': 5771, 'desirable': 694, 'accurate': 996, 'useful': 571, 'better': 73, 'leading': 279, 'facilitate': 1144, 'achievement': 540, 'worth': 586, 'win': 525, 'notably': 152, 'positively': 193, 'appreciable': 8, 'achievable': 77, 'harmless': 41, 'exceeded': 1}
ndict14 = {'lack': 2554, 'decline': 14373, 'limit': 25459, 'adverse': 37976, 'delay': 14748, 'debt': 26429, 'uncertain': 2762, 'loss': 42303, 'issue': 20446, 'fail': 11245, 'unsuccessful': 1150, 'stringent': 2164, 'lose': 5978, 'penalty': 7367, 'suffer': 4810, 'harm': 10184, 'unable': 17387, 'expire': 3345, 'inability': 6352, 'failure': 17783, 'liability': 26361, 'fails': 225, 'threat': 1805, 'breach': 7248, 'critical': 3127, 'concern': 5523, 'difficult': 9092, 'shortage': 1996, 'incorrect': 619, 'volatile': 2616, 'problem': 4472, 'immaterial': 705, 'impair': 4286, 'expensive': 2801, 'crisis': 1238, 'negative': 7365, 'limited': 9462, 'disadvantage': 1072, 'obsolete': 927, 'dispute': 2376, 'infringe': 1975, 'proprietary': 6225, 'infringement': 2351, 'volatility': 5551, 'impede': 1067, 'unfamiliar': 66, 'sue': 260, 'delayed': 330, 'intense': 1615, 'warning': 454, 'unavailable': 792, 'split': 206, 'aggressive': 505, 'miss': 147, 'lengthy': 686, 'drain': 35, 'disruptive': 514, 'delays': 26, 'failures': 324, 'insufficient': 1631, 'inadequate': 1400, 'omission': 441, 'slowly': 186, 'enjoin': 202, 'unlikely': 283, 'doubt': 132, 'prohibitively': 74, 'cautionary': 253, 'unproven': 80, 'weaknesses': 92, 'suffered': 3, 'harmed': 13, 'unachievable': 2, 'risk': 7, 'errors': 3}

pdict15 = {'protection': 8687, 'available': 19812, 'outstanding': 9799, 'well': 27195, 'exceed': 5828, 'respect': 13481, 'significant': 45386, 'timely': 7461, 'sufficient': 9835, 'effective': 9209, 'assurance': 11595, 'work': 9768, 'approval': 17320, 'right': 21628, 'successfully': 9057, 'favorable': 5893, 'variety': 6731, 'adequate': 6193, 'stable': 752, 'effectively': 7270, 'regard': 11106, 'protect': 9188, 'qualify': 6226, 'leverage': 4916, 'good': 4989, 'clean': 720, 'success': 7827, 'improve': 5721, 'advantage': 3624, 'stability': 1079, 'attractive': 2701, 'secure': 9221, 'important': 4285, 'reliable': 1192, 'accurately': 1871, 'integrated': 1059, 'recover': 3453, 'reasonable': 4371, 'consummate': 1067, 'successful': 7706, 'confidence': 2455, 'qualified': 4602, 'flexibility': 2522, 'effectiveness': 2241, 'succeed': 796, 'advanced': 1457, 'like': 1355, 'faith': 720, 'restructuring': 1308, 'positive': 1240, 'fair': 6028, 'accurate': 1041, 'leading': 297, 'sufficiently': 705, 'useful': 576, 'suitable': 1783, 'better': 74, 'facilitate': 1224, 'achievement': 565, 'notably': 162, 'positively': 199, 'win': 584, 'appreciable': 10, 'achievable': 81, 'harmless': 42, 'exceeded': 1}
ndict15 = {'lack': 2589, 'limit': 26168, 'uncertain': 2780, 'debt': 27437, 'unable': 17980, 'delay': 15585, 'loss': 43330, 'decline': 14940, 'unsuccessful': 1212, 'stringent': 2337, 'adverse': 39707, 'fail': 11906, 'lose': 6236, 'penalty': 8035, 'suffer': 4989, 'harm': 10763, 'expire': 3300, 'inability': 6726, 'failure': 18955, 'liability': 27663, 'fails': 241, 'threat': 2244, 'breach': 8698, 'critical': 3355, 'concern': 5567, 'issue': 21141, 'difficult': 9251, 'shortage': 2022, 'incorrect': 651, 'volatile': 2726, 'impair': 4445, 'limited': 9941, 'problem': 4600, 'expensive': 2874, 'crisis': 1108, 'negative': 7805, 'disadvantage': 1126, 'obsolete': 927, 'dispute': 2555, 'infringe': 2098, 'proprietary': 6820, 'infringement': 2437, 'volatility': 5638, 'impede': 1100, 'unfamiliar': 69, 'immaterial': 706, 'intense': 1680, 'warning': 458, 'unavailable': 854, 'miss': 163, 'lengthy': 711, 'drain': 39, 'disruptive': 546, 'delays': 26, 'failures': 357, 'cautionary': 248, 'inadequate': 1490, 'slowly': 174, 'unlikely': 281, 'doubt': 126, 'unproven': 66, 'weaknesses': 109, 'suffered': 4, 'split': 234, 'harmed': 13, 'expired': 46, 'faults': 1, 'unachievable': 3, 'risk': 5, 'errors': 4}

pdict16 = {'successfully': 9240, 'well': 27879, 'exceed': 6038, 'respect': 13835, 'lead': 10196, 'improve': 5837, 'succeed': 802, 'win': 579, 'advanced': 1494, 'work': 9826, 'timely': 7616, 'effective': 9396, 'variety': 7026, 'protection': 9159, 'good': 5051, 'clean': 758, 'favorable': 6004, 'integrated': 1066, 'success': 7835, 'advantage': 3730, 'significant': 46399, 'stability': 1104, 'stable': 762, 'approval': 17766, 'regard': 11436, 'secure': 9557, 'sufficient': 10108, 'right': 21754, 'reliable': 1220, 'accurately': 1921, 'protect': 9532, 'satisfactory': 1340, 'available': 20235, 'approve': 6999, 'reasonable': 4453, 'consummate': 1249, 'assurance': 12009, 'adequate': 6306, 'successful': 7833, 'confidence': 2506, 'appropriate': 5250, 'qualified': 4651, 'recover': 3453, 'effectively': 7429, 'outstanding': 10169, 'satisfy': 6127, 'flexibility': 2619, 'effectiveness': 2363, 'accurate': 1086, 'useful': 586, 'fair': 6246, 'faith': 743, 'restructuring': 1500, 'positive': 1313, 'fast': 608, 'satisfied': 252, 'promptly': 493, 'cure': 776, 'suitable': 1774, 'qualify': 6324, 'sufficiently': 731, 'leading': 307, 'notably': 170, 'facilitate': 1317, 'achievement': 562, 'orderly': 208, 'equitable': 241, 'better': 71, 'harmless': 42}
ndict16 = {'limit': 26569, 'delay': 16020, 'debt': 28614, 'critical': 3470, 'volatile': 2804, 'issue': 21526, 'stringent': 2428, 'concern': 5807, 'loss': 43941, 'adverse': 40900, 'unable': 18284, 'unwilling': 531, 'difficult': 9492, 'expire': 3313, 'immaterial': 688, 'impair': 4573, 'decline': 15679, 'lose': 6404, 'limited': 10215, 'unsuccessful': 1212, 'liability': 28323, 'insufficient': 1757, 'fail': 12157, 'crisis': 1030, 'negative': 8065, 'insolvent': 471, 'inability': 6904, 'failure': 19366, 'lack': 2590, 'disadvantage': 1154, 'suffer': 5030, 'penalty': 8546, 'impaired': 1424, 'deficiency': 1857, 'inadequate': 1552, 'harm': 11201, 'obsolete': 924, 'dispute': 2586, 'infringe': 2106, 'proprietary': 7050, 'infringement': 2421, 'threat': 2425, 'volatility': 5900, 'breach': 9463, 'problem': 4555, 'unavailable': 900, 'fails': 257, 'impede': 1100, 'intense': 1684, 'warning': 454, 'disagree': 385, 'uncertain': 2807, 'shortage': 1993, 'surrender': 311, 'split': 287, 'miss': 155, 'lengthy': 741, 'drain': 45, 'disruptive': 596, 'delays': 26, 'failures': 379, 'weakening': 208, 'expired': 54, 'cautionary': 258, 'slowly': 178, 'discriminate': 67, 'unlikely': 281, 'doubt': 171, 'inferior': 49, 'losing': 13, 'deficiencies': 40, 'weaknesses': 86, 'harmed': 19, 'risk': 7, 'forbidden': 5, 'disagreed': 4, 'debts': 4, 'errors': 4, 'interruptions': 1, 'unachievable': 1}
In [34]:
import pandas as pd
# positive words, create a dataframe merge all years' positive words
ppd = pd.DataFrame(list(pdict12.items()), columns=['Positive Words', '2012'])
ppd13 = pd.DataFrame(list(pdict13.items()), columns=['Positive Words', '2013'])
#print(ppd13)
ppd14 = pd.DataFrame(list(pdict14.items()), columns=['Positive Words', '2014'])
ppd15 = pd.DataFrame(list(pdict15.items()), columns=['Positive Words', '2015'])
ppd16 = pd.DataFrame(list(pdict16.items()), columns=['Positive Words', '2016'])
pd = ppd.merge(ppd13, left_on='Positive Words', right_on='Positive Words', how='outer')
pd = pd.merge(ppd14, left_on='Positive Words', right_on='Positive Words', how='outer')
pd = pd.merge(ppd15, left_on='Positive Words', right_on='Positive Words', how='outer')
ppd = pd.merge(ppd16, left_on='Positive Words', right_on='Positive Words', how='outer')
ppd = ppd.fillna(0)
print(ppd)
   Positive Words     2012     2013     2014     2015     2016
0          regard   9111.0  10287.0  10758.0  11106.0  11436.0
1            well  21947.0  24451.0  25699.0  27195.0  27879.0
2    successfully   7362.0   8138.0   8688.0   9057.0   9240.0
3      successful   6360.0   6982.0   7298.0   7706.0   7833.0
4    sufficiently    518.0    652.0    691.0    705.0    731.0
5         variety   5453.0   6077.0   6354.0   6731.0   7026.0
6     effectively   5669.0   6494.0   6926.0   7270.0   7429.0
7         success   6669.0   7248.0   7605.0   7827.0   7835.0
8          timely   5882.0   6626.0   7069.0   7461.0   7616.0
9       favorable   4981.0   5423.0   5701.0   5893.0   6004.0
10        benefit   9746.0  11315.0      0.0      0.0      0.0
11          right  18661.0  20213.0  20891.0  21628.0  21754.0
12     reasonable   3559.0   3994.0   4249.0   4371.0   4453.0
13    significant  36900.0  41641.0  43575.0  45386.0  46399.0
14       reliable    956.0   1073.0   1116.0   1192.0   1220.0
15         secure   7845.0   8729.0   8996.0   9221.0   9557.0
16      assurance   9765.0      0.0  11026.0  11595.0  12009.0
17      effective   7204.0   8313.0   8774.0   9209.0   9396.0
18        respect  11065.0  12423.0  12868.0  13481.0  13835.0
19      important   3582.0   4038.0   4168.0   4285.0      0.0
20         enough    595.0    637.0    653.0      0.0      0.0
21        improve   4994.0   5673.0   5830.0   5721.0   5837.0
22       approval  14772.0  15717.0  16322.0  17320.0  17766.0
23     sufficient   8100.0   8983.0   9488.0   9835.0  10108.0
24      available  17257.0  18797.0  19297.0  19812.0  20235.0
25           work   8102.0   8984.0   9382.0   9768.0   9826.0
26      qualified   3550.0   4128.0   4456.0   4602.0   4651.0
27        protect   7061.0   7933.0   8554.0   9188.0   9532.0
28        leading    285.0    285.0    279.0    297.0    307.0
29     commitment   3517.0   3916.0   4011.0      0.0      0.0
..            ...      ...      ...      ...      ...      ...
55       adequate      0.0      0.0   6020.0   6193.0   6306.0
56         stable      0.0      0.0    709.0    752.0    762.0
57     innovative      0.0      0.0   1027.0      0.0      0.0
58     accurately      0.0      0.0   1775.0   1871.0   1921.0
59         exceed      0.0      0.0   5646.0   5828.0   6038.0
60     confidence      0.0      0.0   2345.0   2455.0   2506.0
61  effectiveness      0.0      0.0   2069.0   2241.0   2363.0
62       advanced      0.0      0.0   1444.0   1457.0   1494.0
63       suitable      0.0      0.0   1724.0   1783.0   1774.0
64         robust      0.0      0.0    293.0      0.0      0.0
65           fair      0.0      0.0   5771.0   6028.0   6246.0
66      desirable      0.0      0.0    694.0      0.0      0.0
67       accurate      0.0      0.0    996.0   1041.0   1086.0
68     facilitate      0.0      0.0   1144.0   1224.0   1317.0
69       leverage      0.0      0.0      0.0   4916.0      0.0
70           good      0.0      0.0      0.0   4989.0   5051.0
71     attractive      0.0      0.0      0.0   2701.0      0.0
72           like      0.0      0.0      0.0   1355.0      0.0
73          faith      0.0      0.0      0.0    720.0    743.0
74           lead      0.0      0.0      0.0      0.0  10196.0
75   satisfactory      0.0      0.0      0.0      0.0   1340.0
76        approve      0.0      0.0      0.0      0.0   6999.0
77    appropriate      0.0      0.0      0.0      0.0   5250.0
78        satisfy      0.0      0.0      0.0      0.0   6127.0
79           fast      0.0      0.0      0.0      0.0    608.0
80      satisfied      0.0      0.0      0.0      0.0    252.0
81       promptly      0.0      0.0      0.0      0.0    493.0
82           cure      0.0      0.0      0.0      0.0    776.0
83        orderly      0.0      0.0      0.0      0.0    208.0
84      equitable      0.0      0.0      0.0      0.0    241.0

[85 rows x 6 columns]
In [54]:
idx = []
for index, row in ppd.iterrows():
    if row['2012'] <= 5000 and row['2013'] <= 5000 and row['2014'] <= 5000 and row['2015'] <= 5000 and row['2016'] <= 5000:
        idx.append(index)
# print(idx)
ppd = ppd.drop(idx)
# print(ppd)
## bar chart 
ppd.set_index('Positive Words').plot(kind='bar', stacked=True, colormap='rainbow', \
   figsize=(10,7), title="Frequency of Positive Words by Year").\
   legend(loc='center left', bbox_to_anchor=(1, 0.5));
ppd.set_xlabel('Positive Words', fontsize=10)
Out[54]:
<matplotlib.legend.Legend at 0x1e54683ecc0>
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-54-95063848727d> in <module>()
      8 ## bar chart
      9 ppd.set_index('Positive Words').plot(kind='bar', stacked=True, colormap='rainbow',    figsize=(10,7), title="Frequency of Positive Words by Year").   legend(loc='center left', bbox_to_anchor=(1, 0.5));
---> 10 ppd.set_xlabel('Positive Words', fontsize=10)

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
   4374             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   4375                 return self[name]
-> 4376             return object.__getattribute__(self, name)
   4377 
   4378     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'set_xlabel'
In [55]:
import pandas as pd
In [56]:
npd = pd.DataFrame(list(ndict12.items()), columns=['negative Words', '2012'])
npd13 = pd.DataFrame(list(ndict13.items()), columns=['negative Words', '2013'])
#print(ppd13)
npd14 = pd.DataFrame(list(ndict14.items()), columns=['negative Words', '2014'])
npd15 = pd.DataFrame(list(ndict15.items()), columns=['negative Words', '2015'])
npd16 = pd.DataFrame(list(ndict16.items()), columns=['negative Words', '2016'])
npd = npd.merge(npd13, left_on='negative Words', right_on='negative Words', how='outer')
npd = npd.merge(npd14, left_on='negative Words', right_on='negative Words', how='outer')
npd = npd.merge(npd15, left_on='negative Words', right_on='negative Words', how='outer')
npd = npd.merge(npd16, left_on='negative Words', right_on='negative Words', how='outer')
npd = npd.fillna(0)
print(npd)
   negative Words     2012     2013     2014     2015     2016
0           issue  17773.0  19775.0  20446.0  21141.0  21526.0
1      immaterial    584.0    667.0    705.0    706.0    688.0
2            loss  37284.0  40745.0  42303.0  43330.0  43941.0
3       inability   5365.0   5897.0   6352.0   6726.0   6904.0
4    insufficient   1334.0   1519.0   1631.0      0.0   1757.0
5          unable  14994.0  16570.0  17387.0  17980.0  18284.0
6         decline  13288.0  14338.0  14373.0  14940.0  15679.0
7            debt  22793.0  25702.0  26429.0  27437.0  28614.0
8          suffer   4092.0   4588.0   4810.0   4989.0   5030.0
9       difficult   8044.0   8802.0   9092.0   9251.0   9492.0
10          delay  12679.0  14077.0  14748.0  15585.0  16020.0
11       infringe   1735.0   1899.0   1975.0   2098.0   2106.0
12           fail   9559.0  10688.0  11245.0  11906.0  12157.0
13         expire   3071.0   3389.0   3345.0   3300.0   3313.0
14           harm   8628.0   9654.0  10184.0  10763.0  11201.0
15         breach   4885.0   6200.0   7248.0   8698.0   9463.0
16        failure  14331.0  16377.0  17783.0  18955.0  19366.0
17    proprietary   5417.0   5873.0   6225.0   6820.0   7050.0
18        adverse  31383.0  35470.0  37976.0  39707.0  40900.0
19       negative   6029.0   6890.0   7365.0   7805.0   8065.0
20        delayed    242.0    301.0    330.0      0.0      0.0
21         impair   3765.0   4157.0   4286.0   4445.0   4573.0
22          limit  21854.0  24093.0  25459.0  26168.0  26569.0
23        limited   8229.0   9028.0   9462.0   9941.0  10215.0
24        problem   3982.0   4295.0   4472.0   4600.0   4555.0
25        intense   1473.0   1569.0   1615.0   1680.0   1684.0
26      uncertain   2422.0   2680.0   2762.0   2780.0   2807.0
27       shortage   1739.0   1942.0   1996.0   2022.0   1993.0
28       critical   2440.0   2880.0   3127.0   3355.0   3470.0
29      liability  22057.0  24946.0  26361.0  27663.0  28323.0
..            ...      ...      ...      ...      ...      ...
63        concern      0.0      0.0   5523.0   5567.0   5807.0
64      incorrect      0.0      0.0    619.0    651.0      0.0
65     unfamiliar      0.0      0.0     66.0     69.0      0.0
66            sue      0.0      0.0    260.0      0.0      0.0
67        warning      0.0      0.0    454.0    458.0    454.0
68     aggressive      0.0      0.0    505.0      0.0      0.0
69     disruptive      0.0      0.0    514.0    546.0    596.0
70       omission      0.0      0.0    441.0      0.0      0.0
71         enjoin      0.0      0.0    202.0      0.0      0.0
72          doubt      0.0      0.0    132.0    126.0    171.0
73       unproven      0.0      0.0     80.0     66.0      0.0
74     weaknesses      0.0      0.0     92.0    109.0     86.0
75   unachievable      0.0      0.0      2.0      3.0      1.0
76         errors      0.0      0.0      3.0      4.0      4.0
77        expired      0.0      0.0      0.0     46.0     54.0
78      unwilling      0.0      0.0      0.0      0.0    531.0
79      insolvent      0.0      0.0      0.0      0.0    471.0
80       impaired      0.0      0.0      0.0      0.0   1424.0
81     deficiency      0.0      0.0      0.0      0.0   1857.0
82       disagree      0.0      0.0      0.0      0.0    385.0
83      surrender      0.0      0.0      0.0      0.0    311.0
84      weakening      0.0      0.0      0.0      0.0    208.0
85   discriminate      0.0      0.0      0.0      0.0     67.0
86       inferior      0.0      0.0      0.0      0.0     49.0
87         losing      0.0      0.0      0.0      0.0     13.0
88   deficiencies      0.0      0.0      0.0      0.0     40.0
89      forbidden      0.0      0.0      0.0      0.0      5.0
90      disagreed      0.0      0.0      0.0      0.0      4.0
91          debts      0.0      0.0      0.0      0.0      4.0
92  interruptions      0.0      0.0      0.0      0.0      1.0

[93 rows x 6 columns]
In [57]:
idx = []
for index, row in npd.iterrows():
    if row['2012'] <= 2000 and row['2013'] <= 2000 and row['2014'] <= 2000 and row['2015'] <= 2000 and row['2016'] <= 2000:
        idx.append(index)
print(idx)
npd = npd.drop(idx)
print(npd)
[1, 4, 20, 25, 30, 32, 33, 34, 38, 39, 41, 42, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92]
   negative Words     2012     2013     2014     2015     2016
0           issue  17773.0  19775.0  20446.0  21141.0  21526.0
2            loss  37284.0  40745.0  42303.0  43330.0  43941.0
3       inability   5365.0   5897.0   6352.0   6726.0   6904.0
5          unable  14994.0  16570.0  17387.0  17980.0  18284.0
6         decline  13288.0  14338.0  14373.0  14940.0  15679.0
7            debt  22793.0  25702.0  26429.0  27437.0  28614.0
8          suffer   4092.0   4588.0   4810.0   4989.0   5030.0
9       difficult   8044.0   8802.0   9092.0   9251.0   9492.0
10          delay  12679.0  14077.0  14748.0  15585.0  16020.0
11       infringe   1735.0   1899.0   1975.0   2098.0   2106.0
12           fail   9559.0  10688.0  11245.0  11906.0  12157.0
13         expire   3071.0   3389.0   3345.0   3300.0   3313.0
14           harm   8628.0   9654.0  10184.0  10763.0  11201.0
15         breach   4885.0   6200.0   7248.0   8698.0   9463.0
16        failure  14331.0  16377.0  17783.0  18955.0  19366.0
17    proprietary   5417.0   5873.0   6225.0   6820.0   7050.0
18        adverse  31383.0  35470.0  37976.0  39707.0  40900.0
19       negative   6029.0   6890.0   7365.0   7805.0   8065.0
21         impair   3765.0   4157.0   4286.0   4445.0   4573.0
22          limit  21854.0  24093.0  25459.0  26168.0  26569.0
23        limited   8229.0   9028.0   9462.0   9941.0  10215.0
24        problem   3982.0   4295.0   4472.0   4600.0   4555.0
26      uncertain   2422.0   2680.0   2762.0   2780.0   2807.0
27       shortage   1739.0   1942.0   1996.0   2022.0   1993.0
28       critical   2440.0   2880.0   3127.0   3355.0   3470.0
29      liability  22057.0  24946.0  26361.0  27663.0  28323.0
31        dispute   1935.0   2220.0   2376.0   2555.0   2586.0
35         threat   1175.0   1537.0   1805.0   2244.0   2425.0
36   infringement   2039.0   2172.0   2351.0   2437.0   2421.0
37      expensive   2447.0   2612.0   2801.0   2874.0      0.0
40      stringent   1791.0   2079.0   2164.0   2337.0   2428.0
43     volatility   4874.0   5398.0   5551.0   5638.0   5900.0
44           lack   2302.0   2491.0   2554.0   2589.0   2590.0
45       volatile   2295.0   2550.0   2616.0   2726.0   2804.0
61           lose      0.0      0.0   5978.0   6236.0   6404.0
62        penalty      0.0      0.0   7367.0   8035.0   8546.0
63        concern      0.0      0.0   5523.0   5567.0   5807.0
In [59]:
npd.set_index('negative Words').plot(kind='barh', stacked=True, colormap='rainbow', \
   figsize=(10,7), title="Frequency of Negative Words by Year").\
   legend(loc='center left', bbox_to_anchor=(1, 0.5));
In [ ]: